diff options
Diffstat (limited to 'offload')
92 files changed, 2867 insertions, 6029 deletions
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index 38fa77e41bb5..b27738078350 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -4,7 +4,8 @@ cmake_minimum_required(VERSION 3.20.0) set(LLVM_SUBPROJECT_TITLE "liboffload") -if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") +# Permit redefining OPENMP_STANDALONE_BUILD when doing a runtimes build. +if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(OPENMP_STANDALONE_BUILD TRUE) project(offload C CXX ASM) else() @@ -371,7 +372,6 @@ add_subdirectory(tools/offload-tblgen) # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins-nextgen) -add_subdirectory(DeviceRTL) add_subdirectory(tools) add_subdirectory(docs) diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h deleted file mode 100644 index dc4d029ed75f..000000000000 --- a/offload/DeviceRTL/include/Allocator.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_ALLOCATOR_H -#define OMPTARGET_ALLOCATOR_H - -#include "DeviceTypes.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace allocator { - -static uint64_t constexpr ALIGNMENT = 16; - -/// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); - -/// Allocate \p Size bytes. 
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * -alloc(uint64_t Size); - -/// Free the allocation pointed to by \p Ptr. -void free(void *Ptr); - -} // namespace allocator - -} // namespace ompx - -extern "C" { -void *malloc(size_t Size); -void free(void *Ptr); -} - -#endif diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h deleted file mode 100644 index 95408933dd86..000000000000 --- a/offload/DeviceRTL/include/Configuration.h +++ /dev/null @@ -1,68 +0,0 @@ -//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// API to query the global (constant) device environment. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_CONFIGURATION_H -#define OMPTARGET_CONFIGURATION_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -namespace ompx { -namespace config { - -/// Return the number of devices in the system, same number as returned on the -/// host by omp_get_num_devices. -uint32_t getNumDevices(); - -/// Return the device number in the system for omp_get_device_num. -uint32_t getDeviceNum(); - -/// Return the user chosen debug level. -uint32_t getDebugKind(); - -/// Return if teams oversubscription is assumed -uint32_t getAssumeTeamsOversubscription(); - -/// Return if threads oversubscription is assumed -uint32_t getAssumeThreadsOversubscription(); - -/// Return the amount of dynamic shared memory that was allocated at launch. -uint64_t getDynamicMemorySize(); - -/// Returns the cycles per second of the device's fixed frequency clock. 
-uint64_t getClockFrequency(); - -/// Returns the pointer to the beginning of the indirect call table. -void *getIndirectCallTablePtr(); - -/// Returns the size of the indirect call table. -uint64_t getIndirectCallTableSize(); - -/// Returns the size of the indirect call table. -uint64_t getHardwareParallelism(); - -/// Return if debugging is enabled for the given debug kind. -bool isDebugMode(DeviceDebugKind Level); - -/// Indicates if this kernel may require thread-specific states, or if it was -/// explicitly disabled by the user. -bool mayUseThreadStates(); - -/// Indicates if this kernel may require data environments for nested -/// parallelism, or if it was explicitly disabled by the user. -bool mayUseNestedParallelism(); - -} // namespace config -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h deleted file mode 100644 index 98d0fa498d95..000000000000 --- a/offload/DeviceRTL/include/Debug.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEBUG_H -#define OMPTARGET_DEVICERTL_DEBUG_H - -#include "Configuration.h" -#include "LibC.h" - -/// Assertion -/// -/// { -extern "C" { -void __assert_assume(bool condition); -void __assert_fail(const char *expr, const char *file, unsigned line, - const char *function); -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function); -} - -#define ASSERT(expr, msg) \ - { \ - if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr)) \ - __assert_fail_internal(#expr, msg, __FILE__, __LINE__, \ - __PRETTY_FUNCTION__); \ - else \ - __assert_assume(expr); \ - } -#define UNREACHABLE(msg) \ - printf(msg); \ - __builtin_trap(); \ - __builtin_unreachable(); - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h deleted file mode 100644 index 2e5d92380f04..000000000000 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ /dev/null @@ -1,166 +0,0 @@ -//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_TYPES_H -#define OMPTARGET_TYPES_H - -#include <gpuintrin.h> -#include <stddef.h> -#include <stdint.h> - -template <typename T> using Private = __gpu_private T; -template <typename T> using Constant = __gpu_constant T; -template <typename T> using Local = __gpu_local T; -template <typename T> using Global = __gpu_local T; - -enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -}; - -enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -}; - -enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. 
- * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s) & kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -}; - -struct TaskDescriptorTy; -using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); -struct TaskDescriptorTy { - void *Payload; - TaskFnTy TaskFn; -}; - -using LaneMaskTy = uint64_t; - -namespace lanes { -enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; -} // namespace lanes - -/// The ident structure that describes a source location. The struct is -/// identical to the one in the kmp.h file. We maintain the same data structure -/// for compatibility. 
-struct IdentTy { - int32_t reserved_1; /**< might be used in Fortran; see above */ - int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - int32_t reserved_2; /**< not really used in Fortran any more; see above */ - int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. */ -}; - -using __kmpc_impl_lanemask_t = LaneMaskTy; - -using ParallelRegionFnTy = void *; - -using CriticalNameTy = int32_t[8]; - -struct omp_lock_t { - void *Lock; -}; - -using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); -using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, - int16_t lane_offset, int16_t shortCircuit); -using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); - -/// Macros for allocating variables in different address spaces. -///{ - -// Follows the pattern in interface.h -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) -} omp_allocator_handle_t; - -#define __PRAGMA(STR) _Pragma(#STR) -#define OMP_PRAGMA(STR) __PRAGMA(omp STR) - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h deleted file mode 100644 index b92514ee9838..000000000000 --- a/offload/DeviceRTL/include/DeviceUtils.h +++ /dev/null @@ -1,96 +0,0 @@ -//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H -#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H - -#include "DeviceTypes.h" -#include "Shared/Utils.h" - -namespace utils { - -template <typename T> struct type_identity { - using type = T; -}; - -template <typename T, T v> struct integral_constant { - inline static constexpr T value = v; -}; - -/// Freestanding SFINAE helpers. -template <class T> struct remove_cv : type_identity<T> {}; -template <class T> struct remove_cv<const T> : type_identity<T> {}; -template <class T> struct remove_cv<volatile T> : type_identity<T> {}; -template <class T> struct remove_cv<const volatile T> : type_identity<T> {}; -template <class T> using remove_cv_t = typename remove_cv<T>::type; - -using true_type = integral_constant<bool, true>; -using false_type = integral_constant<bool, false>; - -template <typename T, typename U> struct is_same : false_type {}; -template <typename T> struct is_same<T, T> : true_type {}; -template <typename T, typename U> -inline constexpr bool is_same_v = is_same<T, U>::value; - -template <typename T> struct is_floating_point { - inline static constexpr bool value = - is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>; -}; -template <typename T> -inline constexpr bool is_floating_point_v = is_floating_point<T>::value; - -template <bool B, typename T = void> struct enable_if; -template <typename T> struct enable_if<true, T> : type_identity<T> {}; -template <bool B, typename T = void> -using enable_if_t = typename enable_if<B, T>::type; - -template <class T> struct remove_addrspace : type_identity<T> {}; -template <class T, int N> -struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> 
{}; -template <class T> -using remove_addrspace_t = typename remove_addrspace<T>::type; - -template <typename To, typename From> inline To bitCast(From V) { - static_assert(sizeof(To) == sizeof(From), "Bad conversion"); - return __builtin_bit_cast(To, V); -} - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). -bool isThreadLocalMemPtr(void *Ptr); - -/// A pointer variable that has by design an `undef` value. Use with care. -[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils - -#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h deleted file mode 100644 index c4bfaaa2404b..000000000000 --- a/offload/DeviceRTL/include/Interface.h +++ /dev/null @@ -1,366 +0,0 @@ -//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_INTERFACE_H -#define OMPTARGET_DEVICERTL_INTERFACE_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -/// External API -/// -///{ - -extern "C" { - -/// ICV: dyn-var, constant 0 -/// -/// setter: ignored. -/// getter: returns 0. -/// -///{ -void omp_set_dynamic(int); -int omp_get_dynamic(void); -///} - -/// ICV: nthreads-var, integer -/// -/// scope: data environment -/// -/// setter: ignored. -/// getter: returns false. -/// -/// implementation notes: -/// -/// -///{ -void omp_set_num_threads(int); -int omp_get_max_threads(void); -///} - -/// ICV: thread-limit-var, computed -/// -/// getter: returns thread limited defined during launch. -/// -///{ -int omp_get_thread_limit(void); -///} - -/// ICV: max-active-level-var, constant 1 -/// -/// setter: ignored. -/// getter: returns 1. -/// -///{ -void omp_set_max_active_levels(int); -int omp_get_max_active_levels(void); -///} - -/// ICV: places-partition-var -/// -/// -///{ -///} - -/// ICV: active-level-var, 0 or 1 -/// -/// getter: returns 0 or 1. -/// -///{ -int omp_get_active_level(void); -///} - -/// ICV: level-var -/// -/// getter: returns parallel region nesting -/// -///{ -int omp_get_level(void); -///} - -/// ICV: run-sched-var -/// -/// -///{ -void omp_set_schedule(omp_sched_t, int); -void omp_get_schedule(omp_sched_t *, int *); -///} - -/// TODO this is incomplete. 
-int omp_get_num_threads(void); -int omp_get_thread_num(void); -void omp_set_nested(int); - -int omp_get_nested(void); - -void omp_set_max_active_levels(int Level); - -int omp_get_max_active_levels(void); - -omp_proc_bind_t omp_get_proc_bind(void); - -int omp_get_num_places(void); - -int omp_get_place_num_procs(int place_num); - -void omp_get_place_proc_ids(int place_num, int *ids); - -int omp_get_place_num(void); - -int omp_get_partition_num_places(void); - -void omp_get_partition_place_nums(int *place_nums); - -int omp_get_cancellation(void); - -void omp_set_default_device(int deviceId); - -int omp_get_default_device(void); - -int omp_get_num_devices(void); - -int omp_get_device_num(void); - -int omp_get_num_teams(void); - -int omp_get_team_num(); - -int omp_get_initial_device(void); - -void *llvm_omp_target_dynamic_shared_alloc(); - -/// Synchronization -/// -///{ -void omp_init_lock(omp_lock_t *Lock); - -void omp_destroy_lock(omp_lock_t *Lock); - -void omp_set_lock(omp_lock_t *Lock); - -void omp_unset_lock(omp_lock_t *Lock); - -int omp_test_lock(omp_lock_t *Lock); -///} - -/// Tasking -/// -///{ -int omp_in_final(void); - -int omp_get_max_task_priority(void); -///} - -/// Misc -/// -///{ -double omp_get_wtick(void); - -double omp_get_wtime(void); -///} -} - -extern "C" { -/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be -/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be -/// called by any thread, allocation happens *per thread*. -void *__kmpc_alloc_shared(uint64_t Bytes); - -/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like -/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the -/// allocated by __kmpc_alloc_shared by the same thread. -void __kmpc_free_shared(void *Ptr, uint64_t Bytes); - -/// Get a pointer to the memory buffer containing dynamically allocated shared -/// memory configured at launch. 
-void *__kmpc_get_dynamic_shared(); - -/// Allocate sufficient space for \p NumArgs sequential `void*` and store the -/// allocation address in \p GlobalArgs. -/// -/// Called by the main thread prior to a parallel region. -/// -/// We also remember it in GlobalArgsPtr to ensure the worker threads and -/// deallocation function know the allocation address too. -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); - -/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. -/// -/// Called by the main thread after a parallel region. -void __kmpc_end_sharing_variables(); - -/// Store the allocation address obtained via __kmpc_begin_sharing_variables in -/// \p GlobalArgs. -/// -/// Called by the worker threads in the parallel region (function). -void __kmpc_get_shared_variables(void ***GlobalArgs); - -/// External interface to get the thread ID. -uint32_t __kmpc_get_hardware_thread_id_in_block(); - -/// External interface to get the number of threads. -uint32_t __kmpc_get_hardware_num_threads_in_block(); - -/// External interface to get the warp size. 
-uint32_t __kmpc_get_warp_size(); - -/// Kernel -/// -///{ -// Forward declaration -struct KernelEnvironmentTy; - -int8_t __kmpc_is_spmd_exec_mode(); - -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -void __kmpc_target_deinit(); - -///} - -/// Reduction -/// -///{ -void *__kmpc_reduction_get_fixed_buffer(); - -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct); - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct); -///} - -/// Synchronization -/// -///{ -void __kmpc_ordered(IdentTy *Loc, int32_t TId); - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId); - -void __kmpc_end_master(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter); - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId); - -void __kmpc_end_single(IdentTy *Loc, int32_t TId); - -void __kmpc_flush(IdentTy *Loc); - -uint64_t __kmpc_warp_active_thread_mask(void); - -void __kmpc_syncwarp(uint64_t Mask); - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); -///} - -/// Parallelism -/// -///{ -/// TODO -void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy 
WorkFn); - -/// TODO -bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); - -/// TODO -void __kmpc_kernel_end_parallel(); - -/// TODO -void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); - -/// TODO -void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, - int32_t ThreadLimit); - -/// TODO -uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); - -///} - -/// Tasking -/// -///{ -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn); - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *); - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *); - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *); -///} - -/// Misc -/// -///{ -int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); - -int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); -///} - -/// Shuffle -/// -///{ -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -///} -} - -#endif diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h deleted file 
mode 100644 index 94b5e6519606..000000000000 --- a/offload/DeviceRTL/include/LibC.h +++ /dev/null @@ -1,23 +0,0 @@ -//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_LIBC_H -#define OMPTARGET_LIBC_H - -#include "DeviceTypes.h" - -namespace ompx { - -int printf(const char *Format, ...); - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h deleted file mode 100644 index 8ba018b5314a..000000000000 --- a/offload/DeviceRTL/include/Mapping.h +++ /dev/null @@ -1,108 +0,0 @@ -//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_MAPPING_H -#define OMPTARGET_MAPPING_H - -#include "DeviceTypes.h" - -namespace ompx { - -namespace mapping { - -enum { - DIM_X = __GPU_X_DIM, - DIM_Y = __GPU_Y_DIM, - DIM_Z = __GPU_Z_DIM, -}; - -inline constexpr uint32_t MaxThreadsPerTeam = 1024; - -/// Initialize the mapping machinery. -void init(bool IsSPMD); - -/// Return true if the kernel is executed in SPMD mode. -bool isSPMDMode(); - -/// Return true if the kernel is executed in generic mode. 
-bool isGenericMode(); - -/// Return true if the executing thread is the main thread in generic mode. -/// These functions will lookup state and it is required that that is OK for the -/// thread and location. See also `isInitialThreadInLevel0` for a stateless -/// alternative for certain situations, e.g. during initialization. -bool isMainThreadInGenericMode(); -bool isMainThreadInGenericMode(bool IsSPMD); - -/// Return true if this thread is the initial thread in parallel level 0. -/// -/// The thread for which this returns true should be used for single threaded -/// initialization tasks. We pick a special thread to ensure there are no -/// races between the initialization and the first read of initialized state. -bool isInitialThreadInLevel0(bool IsSPMD); - -/// Return true if the executing thread has the lowest Id of the active threads -/// in the warp. -bool isLeaderInWarp(); - -/// Return a mask describing all active threads in the warp. -LaneMaskTy activemask(); - -/// Return a mask describing all threads with a smaller Id in the warp. -LaneMaskTy lanemaskLT(); - -/// Return a mask describing all threads with a larger Id in the warp. -LaneMaskTy lanemaskGT(); - -/// Return the thread Id in the warp, in [0, getWarpSize()). -uint32_t getThreadIdInWarp(); - -/// Return the warp size, thus number of threads in the warp. -uint32_t getWarpSize(); - -/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()] -uint32_t getWarpIdInBlock(); - -/// Return the number of warps in the block. -uint32_t getNumberOfWarpsInBlock(); - -/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)). -uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); - -/// Return the block size, thus number of threads in the block. -uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); - -/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)). 
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); - -/// Return the number of blocks in the kernel. -uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); - -/// Return the kernel size, thus number of threads in the kernel. -uint32_t getNumberOfThreadsInKernel(); - -/// Return the maximal number of threads in the block usable for a team (= -/// parallel region). -/// -/// Note: The version taking \p IsSPMD mode explicitly can be used during the -/// initialization of the target region, that is before `mapping::isSPMDMode()` -/// can be called by any thread other than the main one. -uint32_t getMaxTeamThreads(); -uint32_t getMaxTeamThreads(bool IsSPMD); - -/// Return the number of processing elements on the device. -uint32_t getNumberOfProcessorElements(); - -} // namespace mapping - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h deleted file mode 100644 index d99475225412..000000000000 --- a/offload/DeviceRTL/include/Profiling.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_PROFILING_H -#define OMPTARGET_DEVICERTL_PROFILING_H - -extern "C" { -void __llvm_profile_register_function(void *Ptr); -void __llvm_profile_register_names_function(void *Ptr, long int I); -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); -} - -#endif diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h deleted file mode 100644 index db396dae6e44..000000000000 --- a/offload/DeviceRTL/include/State.h +++ /dev/null @@ -1,377 +0,0 @@ -//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_STATE_H -#define OMPTARGET_STATE_H - -#include "Shared/Environment.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace memory { - -/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. -void *allocShared(uint64_t Size, const char *Reason); - -/// Free \p Ptr, allocated via allocShared, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_free_shared for proper usage. -void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); - -/// Alloca \p Size bytes in global memory, if possible, for \p Reason. 
-void *allocGlobal(uint64_t Size, const char *Reason); - -/// Return a pointer to the dynamic shared memory buffer. -void *getDynamicBuffer(); - -/// Free \p Ptr, allocated via allocGlobal, for \p Reason. -void freeGlobal(void *Ptr, const char *Reason); - -} // namespace memory - -namespace state { - -inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; - -struct ICVStateTy { - uint32_t NThreadsVar; - uint32_t LevelVar; - uint32_t ActiveLevelVar; - uint32_t Padding0Val; - uint32_t MaxActiveLevelsVar; - uint32_t RunSchedVar; - uint32_t RunSchedChunkVar; - - bool operator==(const ICVStateTy &Other) const; - - void assertEqual(const ICVStateTy &Other) const; -}; - -struct TeamStateTy { - void init(bool IsSPMD); - - bool operator==(const TeamStateTy &) const; - - void assertEqual(TeamStateTy &Other) const; - - /// ICVs - /// - /// Preallocated storage for ICV values that are used if the threads have not - /// set a custom default. The latter is supported but unlikely and slow(er). - /// - ///{ - ICVStateTy ICVState; - ///} - - uint32_t ParallelTeamSize; - uint32_t HasThreadState; - ParallelRegionFnTy ParallelRegionFnVar; -}; - -extern Local<TeamStateTy> TeamState; - -struct ThreadStateTy { - - /// ICVs have preallocated storage in the TeamStateTy which is used if a - /// thread has not set a custom value. The latter is supported but unlikely. - /// When it happens we will allocate dynamic memory to hold the values of all - /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an - /// ICV struct to hold them all. This is slower than alternatives but allows - /// users to pay only for what they use. - /// - state::ICVStateTy ICVState; - - ThreadStateTy *PreviousThreadState; - - void init() { - ICVState = TeamState.ICVState; - PreviousThreadState = nullptr; - } - - void init(ThreadStateTy *PreviousTS) { - ICVState = PreviousTS ? 
PreviousTS->ICVState : TeamState.ICVState; - PreviousThreadState = PreviousTS; - } -}; - -extern Local<ThreadStateTy **> ThreadStates; - -/// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -/// Return the kernel and kernel launch environment associated with the current -/// kernel. The former is static and contains compile time information that -/// holds for all instances of the kernel. The latter is dynamic and provides -/// per-launch information. -KernelEnvironmentTy &getKernelEnvironment(); -KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); - -/// TODO -enum ValueKind { - VK_NThreads, - VK_Level, - VK_ActiveLevel, - VK_MaxActiveLevels, - VK_RunSched, - // --- - VK_RunSchedChunk, - VK_ParallelRegionFn, - VK_ParallelTeamSize, - VK_HasThreadState, -}; - -/// TODO -void enterDataEnvironment(IdentTy *Ident); - -/// TODO -void exitDataEnvironment(); - -/// TODO -struct DateEnvironmentRAII { - DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } - ~DateEnvironmentRAII() { exitDataEnvironment(); } -}; - -/// TODO -void resetStateForThread(uint32_t TId); - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. 
-#define lookupForModify32Impl(Member, Ident, ForceTeamState) \ - { \ - if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() || \ - !TeamState.HasThreadState)) \ - return TeamState.ICVState.Member; \ - uint32_t TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ThreadStates[TId])) { \ - ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>( \ - memory::allocGlobal(sizeof(ThreadStateTy), \ - "ICV modification outside data environment")); \ - ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!"); \ - TeamState.HasThreadState = true; \ - ThreadStates[TId]->init(); \ - } \ - return ThreadStates[TId]->ICVState.Member; \ - } - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. -#define lookupImpl(Member, ForceTeamState) \ - { \ - auto TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && \ - TeamState.HasThreadState && ThreadStates[TId])) \ - return ThreadStates[TId]->ICVState.Member; \ - return TeamState.ICVState.Member; \ - } - -[[gnu::always_inline, gnu::flatten]] inline uint32_t & -lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - switch (Kind) { - case state::VK_NThreads: - if (IsReadonly) - lookupImpl(NThreadsVar, ForceTeamState); - lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState); - case state::VK_Level: - if (IsReadonly) - lookupImpl(LevelVar, ForceTeamState); - lookupForModify32Impl(LevelVar, Ident, ForceTeamState); - case state::VK_ActiveLevel: - if (IsReadonly) - lookupImpl(ActiveLevelVar, ForceTeamState); - lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState); - case state::VK_MaxActiveLevels: - if (IsReadonly) - lookupImpl(MaxActiveLevelsVar, ForceTeamState); - lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState); - case state::VK_RunSched: - if (IsReadonly) - lookupImpl(RunSchedVar, ForceTeamState); - lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState); - case state::VK_RunSchedChunk: - if 
(IsReadonly) - lookupImpl(RunSchedChunkVar, ForceTeamState); - lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState); - case state::VK_ParallelTeamSize: - return TeamState.ParallelTeamSize; - case state::VK_HasThreadState: - return TeamState.HasThreadState; - default: - break; - } - __builtin_unreachable(); -} - -[[gnu::always_inline, gnu::flatten]] inline void *& -lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { - switch (Kind) { - case state::VK_ParallelRegionFn: - return TeamState.ParallelRegionFnVar; - default: - break; - } - __builtin_unreachable(); -} - -/// A class without actual state used to provide a nice interface to lookup and -/// update ICV values we can declare in global scope. -template <typename Ty, ValueKind Kind> struct Value { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) { - set(Other, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator++() { - inc(1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator--() { - inc(-1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] void - assert_eq(const Ty &V, IdentTy *Ident = nullptr, - bool ForceTeamState = false) { - ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr); - } - -private: - [[gnu::flatten, gnu::always_inline]] Ty & - lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState); - return t; - } - - [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) += - UpdateVal); - } - - [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, 
/*ForceTeamState=*/false) = - UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -/// A mookup class without actual state used to provide -/// a nice interface to lookup and update ICV values -/// we can declare in global scope. -template <typename Ty, ValueKind Kind> struct PtrValue { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) { - set(Other); - return *this; - } - -private: - Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) { - return lookupPtr(Kind, IsReadonly, ForceTeamState); - } - - Ty &set(Ty UpdateVal) { - return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false) = UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -template <typename VTy, typename Ty> struct ValueRAII { - ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident, - bool ForceTeamState = false) - : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState) - : (Ty *)utils::UndefPtr), - Val(OldValue), Active(Active) { - if (!Active) - return; - ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!"); - *Ptr = NewValue; - } - ~ValueRAII() { - if (Active) - *Ptr = Val; - } - -private: - Ty *Ptr; - Ty Val; - bool Active; -}; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk; - -/// TODO -inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize; - -/// TODO -inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState; - -/// TODO -inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn> - ParallelRegionFn; - -void runAndCheckState(void(Func(void))); - -void assumeInitialState(bool IsSPMD); - -/// Return the value of the ParallelTeamSize ICV. 
-int getEffectivePTeamSize(); - -} // namespace state - -namespace icv { - -/// TODO -inline state::Value<uint32_t, state::VK_NThreads> NThreads; - -/// TODO -inline state::Value<uint32_t, state::VK_Level> Level; - -/// The `active-level` describes which of the parallel level counted with the -/// `level-var` is active. There can only be one. -/// -/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0. -inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel; - -/// TODO -inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSched> RunSched; - -} // namespace icv - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h deleted file mode 100644 index 7e7c8eacb917..000000000000 --- a/offload/DeviceRTL/include/Synchronization.h +++ /dev/null @@ -1,225 +0,0 @@ -//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H - -#include "DeviceTypes.h" -#include "DeviceUtils.h" - -namespace ompx { -namespace atomic { - -enum OrderingTy { - relaxed = __ATOMIC_RELAXED, - acquire = __ATOMIC_ACQUIRE, - release = __ATOMIC_RELEASE, - acq_rel = __ATOMIC_ACQ_REL, - seq_cst = __ATOMIC_SEQ_CST, -}; - -enum MemScopeTy { - system = __MEMORY_SCOPE_SYSTEM, - device = __MEMORY_SCOPE_DEVICE, - workgroup = __MEMORY_SCOPE_WRKGRP, - wavefront = __MEMORY_SCOPE_WVFRNT, - single = __MEMORY_SCOPE_SINGLE, -}; - -/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. -uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device); - -/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. 
The -/// result is stored in \p *Addr; -/// { - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, - atomic::OrderingTy OrderingFail, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, - OrderingSucc, OrderingFail, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V add(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V load(Ty *Address, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { -#ifdef __NVPTX__ - return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope); -#else - return __scoped_atomic_load_n(Address, Ordering, MemScope); -#endif -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -void store(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - __scoped_atomic_store_n(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V mul(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - Ty TypedCurrentVal, TypedResultVal, TypedNewVal; - bool Success; - do { - TypedCurrentVal = atomic::load(Address, Ordering); - TypedNewVal = TypedCurrentVal * Val; - Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, - atomic::relaxed, MemScope); - } while (!Success); - return TypedResultVal; -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_max(Address, Val, Ordering, 
MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(max( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(min( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(max( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(min( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(min( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(max( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. 
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(min( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(max( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope); -} - -static inline uint32_t -atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - uint32_t R; - __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope); - return R; -} - -///} - -} // namespace atomic - -namespace synchronize { - -/// Initialize the synchronization machinery. Must be called by all threads. -void init(bool IsSPMD); - -/// Synchronize all threads in a warp identified by \p Mask. -void warp(LaneMaskTy Mask); - -/// Synchronize all threads in a block and perform a fence before and after the -/// barrier according to \p Ordering. Note that the fence might be part of the -/// barrier. 
-void threads(atomic::OrderingTy Ordering); - -/// Synchronizing threads is allowed even if they all hit different instances of -/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more -/// restrictive in that it requires all threads to hit the same instance. The -/// noinline is removed by the openmp-opt pass and helps to preserve the -/// information till then. -///{ - -/// Synchronize all threads in a block, they are reaching the same instruction -/// (hence all threads in the block are "aligned"). Also perform a fence before -/// and after the barrier according to \p Ordering. Note that the -/// fence might be part of the barrier if the target offers this. -[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void -threadsAligned(atomic::OrderingTy Ordering); - -///} - -} // namespace synchronize - -namespace fence { - -/// Memory fence with \p Ordering semantics for the team. -void team(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the contention group. -void kernel(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the system. -void system(atomic::OrderingTy Ordering); - -} // namespace fence - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h deleted file mode 100644 index 554c3271c334..000000000000 --- a/offload/DeviceRTL/include/Workshare.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-------- Workshare.h - OpenMP Workshare interface ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_WORKSHARE_H -#define OMPTARGET_WORKSHARE_H - -namespace ompx { - -namespace workshare { - -/// Initialize the worksharing machinery. -void init(bool IsSPMD); - -} // namespace workshare - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/offload/DeviceRTL/include/generated_microtask_cases.gen deleted file mode 100644 index a05f6da2f84f..000000000000 --- a/offload/DeviceRTL/include/generated_microtask_cases.gen +++ /dev/null @@ -1,797 +0,0 @@ -case 0: -((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid); -break; -case 1: -((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]); -break; -case 2: -((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid, - args[0], args[1]); -break; -case 3: -((void (*)(int32_t *, int32_t *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]); -break; -case 4: -((void (*)(int32_t *, int32_t *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3]); -break; -case 5: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4]); -break; -case 6: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5]); -break; -case 7: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6]); -break; -case 8: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, 
void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7]); -break; -case 9: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8]); -break; -case 10: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], - args[4], args[5], args[6], - args[7], args[8], args[9]); -break; -case 11: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10]); -break; -case 12: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11]); -break; -case 13: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12]); -break; -case 14: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13]); -break; -case 15: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, 
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14]); -break; -case 16: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15]); -break; -case 17: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16]); -break; -case 18: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17]); -break; -case 19: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18]); -break; -case 20: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void 
*))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19]); -break; -case 21: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20]); -break; -case 22: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21]); -break; -case 23: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22]); -break; -case 24: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], 
args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23]); -break; -case 25: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], - args[21], args[22], args[23], args[24]); -break; -case 26: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25]); -break; -case 27: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26]); -break; -case 28: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, 
void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27]); -break; -case 29: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28]); -break; -case 30: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29]); -break; -case 31: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, 
void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30]); -break; -case 32: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], - args[31]); -break; -case 33: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32]); -break; -case 34: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, 
void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33]); -break; -case 35: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34]); -break; -case 36: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35]); -break; -case 37: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void 
*, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36]); -break; -case 38: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37]); -break; -case 39: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], 
args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38]); -break; -case 40: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39]); -break; -case 41: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40]); -break; -case 42: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, 
void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41]); -break; -case 43: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42]); -break; -case 44: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, 
void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43]); -break; -case 45: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44]); -break; -case 46: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], 
args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45]); -break; -/// DONE TO HERE -case 47: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46]); -break; -case 48: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], 
args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47]); -break; -case 49: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48]); -break; -case 50: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], 
args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49]); -break; -case 51: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50]); -break; -case 52: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], 
args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51]); -break; -case 53: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52]); -break; -case 54: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], 
args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53]); -break; -case 55: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54]); -break; -case 56: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void 
*, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39], args[40], args[41], - args[42], args[43], args[44], args[45], args[46], - args[47], args[48], args[49], args[50], args[51], - args[52], args[53], args[54], args[55]); -break; -case 57: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56]); -break; -case 58: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void 
*, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57]); -break; -case 59: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - 
args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58]); -break; -case 60: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59]); -break; -case 61: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], 
args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60]); -break; -case 62: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61]); -break; -case 63: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, 
void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46], args[47], args[48], - args[49], args[50], args[51], args[52], args[53], - args[54], args[55], args[56], args[57], args[58], - args[59], args[60], args[61], args[62]); -break; -case 64: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - 
args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61], - args[62], args[63]); -break; diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp deleted file mode 100644 index aac2a6005158..000000000000 --- a/offload/DeviceRTL/src/Allocator.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" -#include "Synchronization.h" - -using namespace ompx; - -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; - -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. 
-struct BumpAllocatorTy final { - - void *alloc(uint64_t Size) { - Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) - __builtin_trap(); - - return reinterpret_cast<void *>(OldData); - } - - void free(void *) {} -}; - -BumpAllocatorTy BumpAllocator; - -/// allocator namespace implementation -/// -///{ - -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. -} - -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } - -///} diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp deleted file mode 100644 index 0c31c66ab2de..000000000000 --- a/offload/DeviceRTL/src/Configuration.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data object of the constant device environment and the -// query API. -// -//===----------------------------------------------------------------------===// - -#include "Configuration.h" -#include "DeviceTypes.h" -#include "State.h" - -using namespace ompx; - -// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled. -[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription = - 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0; - -// This variable should be visible to the plugin so we override the default -// hidden visibility. -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment; - -uint32_t config::getAssumeTeamsOversubscription() { - return __omp_rtl_assume_teams_oversubscription; -} - -uint32_t config::getAssumeThreadsOversubscription() { - return __omp_rtl_assume_threads_oversubscription; -} - -uint32_t config::getDebugKind() { - return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind; -} - -uint32_t config::getNumDevices() { - return __omp_rtl_device_environment.NumDevices; -} - -uint32_t config::getDeviceNum() { - return __omp_rtl_device_environment.DeviceNum; -} - -uint64_t config::getDynamicMemorySize() { - return __omp_rtl_device_environment.DynamicMemSize; -} - -uint64_t config::getClockFrequency() { - return __omp_rtl_device_environment.ClockFrequency; -} - -void *config::getIndirectCallTablePtr() { - return reinterpret_cast<void *>( - __omp_rtl_device_environment.IndirectCallTable); -} - -uint64_t 
config::getHardwareParallelism() { - return __omp_rtl_device_environment.HardwareParallelism; -} - -uint64_t config::getIndirectCallTableSize() { - return __omp_rtl_device_environment.IndirectCallTableSize; -} - -bool config::isDebugMode(DeviceDebugKind Kind) { - return config::getDebugKind() & uint32_t(Kind); -} - -bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } - -bool config::mayUseNestedParallelism() { - if (__omp_rtl_assume_no_nested_parallelism) - return false; - return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; -} diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp deleted file mode 100644 index 5b5482d766b1..000000000000 --- a/offload/DeviceRTL/src/Debug.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug utilities -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -extern "C" { -void __assert_assume(bool condition) { __builtin_assume(condition); } - -#ifndef OMPTARGET_HAS_LIBC -[[gnu::weak]] void __assert_fail(const char *expr, const char *file, - unsigned line, const char *function) { - __assert_fail_internal(expr, nullptr, file, line, function); -} -#endif - -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function) { - if (msg) { - printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, - msg, expr); - } else { - printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr); - } - __builtin_trap(); -} -} diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp deleted file mode 100644 index d6f8c499c890..000000000000 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "DeviceUtils.h" - -#include "Debug.h" -#include "Interface.h" -#include "Mapping.h" -#include "gpuintrin.h" - -using namespace ompx; - -uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { - return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; -} - -void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { - static_assert(sizeof(unsigned long) == 8, ""); - LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL); - HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32); -} - -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, - int32_t Width) { - return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width); -} - -int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? 
Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { - return __gpu_ballot(Mask, Pred); -} - -bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); } - -extern "C" { -int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { - return utils::shuffleDown(lanes::All, Val, Delta, SrcLane); -} - -int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { - return utils::shuffleDown(lanes::All, Val, Delta, Width); -} -} diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp deleted file mode 100644 index 467e44a65276..000000000000 --- a/offload/DeviceRTL/src/Kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the kernel entry points for the device. -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" -#include "Workshare.h" - -using namespace ompx; - -// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and -// must be kept in-sync. 
-enum OMPTgtExecModeFlags : unsigned char { - OMP_TGT_EXEC_MODE_BARE = 0, - OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, - OMP_TGT_EXEC_MODE_SPMD = 1 << 1, - OMP_TGT_EXEC_MODE_GENERIC_SPMD = - OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD -}; - -static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - // Order is important here. - synchronize::init(IsSPMD); - mapping::init(IsSPMD); - state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); - workshare::init(IsSPMD); -} - -/// Simple generic state machine for worker threads. -static void genericStateMachine(IdentTy *Ident) { - uint32_t TId = mapping::getThreadIdInBlock(); - - do { - ParallelRegionFnTy WorkFn = nullptr; - - // Wait for the signal that we have a new work function. - synchronize::threads(atomic::seq_cst); - - // Retrieve the work function from the runtime. - bool IsActive = __kmpc_kernel_parallel(&WorkFn); - - // If there is nothing more to do, break out of the state machine by - // returning to the caller. - if (!WorkFn) - return; - - if (IsActive) { - ASSERT(!mapping::isSPMDMode(), nullptr); - ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); - __kmpc_kernel_end_parallel(); - } - - synchronize::threads(atomic::seq_cst); - - } while (true); -} - -extern "C" { - -/// Initialization -/// -/// \param Ident Source location identification, can be NULL. 
-/// -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; - bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; - bool UseGenericStateMachine = Configuration.UseGenericStateMachine; - if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); - synchronize::threadsAligned(atomic::relaxed); - } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); - // No need to wait since only the main threads will execute user - // code and workers will run into a barrier right away. - } - - if (IsSPMD) { - state::assumeInitialState(IsSPMD); - - // Synchronize to ensure the assertions above are in an aligned region. - // The barrier is eliminated later. - synchronize::threadsAligned(atomic::relaxed); - return -1; - } - - if (mapping::isInitialThreadInLevel0(IsSPMD)) - return -1; - - // Enter the generic state machine if enabled and if this thread can possibly - // be an active worker thread. - // - // The latter check is important for NVIDIA Pascal (but not Volta) and AMD - // GPU. In those cases, a single thread can apparently satisfy a barrier on - // behalf of all threads in the same warp. Thus, it would not be safe for - // other threads in the main thread's warp to reach the first - // synchronize::threads call in genericStateMachine before the main thread - // reaches its corresponding synchronize::threads call: that would permit all - // active worker threads to proceed before the main thread has actually set - // state::ParallelRegionFn, and then they would immediately quit without - // doing any work. mapping::getMaxTeamThreads() does not include any of the - // main thread's warp, so none of its threads can ever be active worker - // threads. 
- if (UseGenericStateMachine && - mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) - genericStateMachine(KernelEnvironment.Ident); - - return mapping::getThreadIdInBlock(); -} - -/// De-Initialization -/// -/// In non-SPMD, this function releases the workers trapped in a state machine -/// and also any memory dynamically allocated by the runtime. -/// -/// \param Ident Source location identification, can be NULL. -/// -void __kmpc_target_deinit() { - bool IsSPMD = mapping::isSPMDMode(); - if (IsSPMD) - return; - - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - // Signal the workers to exit the state machine and exit the kernel. - state::ParallelRegionFn = nullptr; - } else if (!state::getKernelEnvironment() - .Configuration.UseGenericStateMachine) { - // Retrieve the work function just to ensure we always call - // __kmpc_kernel_parallel even if a custom state machine is used. - // TODO: this is not super pretty. The problem is we create the call to - // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it - // is not there yet. Thus, we assume we never reach it from - // __kmpc_target_deinit. That allows us to remove the store in there to - // ParallelRegionFn, which leads to bad results later on. - ParallelRegionFnTy WorkFn = nullptr; - __kmpc_kernel_parallel(&WorkFn); - ASSERT(WorkFn == nullptr, nullptr); - } -} - -int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } -} diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp deleted file mode 100644 index 83f9233d9480..000000000000 --- a/offload/DeviceRTL/src/LibC.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "LibC.h" - -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) -extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; } -#else -extern "C" int vprintf(const char *format, __builtin_va_list); -#endif - -extern "C" { -[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) { - auto *L = reinterpret_cast<const unsigned char *>(lhs); - auto *R = reinterpret_cast<const unsigned char *>(rhs); - - for (size_t I = 0; I < count; ++I) - if (L[I] != R[I]) - return (int)L[I] - (int)R[I]; - - return 0; -} - -[[gnu::weak]] void memset(void *dst, int C, size_t count) { - auto *dstc = reinterpret_cast<char *>(dst); - for (size_t I = 0; I < count; ++I) - dstc[I] = C; -} - -[[gnu::weak]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} - -namespace ompx { -[[clang::no_builtin("printf")]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} // namespace ompx diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp deleted file mode 100644 index b145892d1ece..000000000000 --- a/offload/DeviceRTL/src/Mapping.cpp +++ /dev/null @@ -1,212 +0,0 @@ -//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Mapping.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" -#include "gpuintrin.h" - -using namespace ompx; - -// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI -// is set to 'none'. We only support COV5+ but this can be removed when COV4 is -// fully deprecated. -#ifdef __AMDGPU__ -extern const inline uint32_t __oclc_ABI_version = 500; -[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__; -#endif - -static bool isInLastWarp() { - uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & - ~(mapping::getWarpSize() - 1); - return mapping::getThreadIdInBlock() == MainTId; -} - -bool mapping::isMainThreadInGenericMode(bool IsSPMD) { - if (IsSPMD || icv::Level) - return false; - - // Check if this is the last warp in the block. 
- return isInLastWarp(); -} - -bool mapping::isMainThreadInGenericMode() { - return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); -} - -bool mapping::isInitialThreadInLevel0(bool IsSPMD) { - if (IsSPMD) - return mapping::getThreadIdInBlock() == 0; - return isInLastWarp(); -} - -bool mapping::isLeaderInWarp() { - __kmpc_impl_lanemask_t Active = mapping::activemask(); - __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); - return utils::popc(Active & LaneMaskLT) == 0; -} - -LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } - -LaneMaskTy mapping::lanemaskLT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_lt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - int64_t Ballot = mapping::activemask(); - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; - return Mask & Ballot; -#endif -} - -LaneMaskTy mapping::lanemaskGT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_gt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - if (Lane == (mapping::getWarpSize() - 1)) - return 0; - int64_t Ballot = mapping::activemask(); - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); - return Mask & Ballot; -#endif -} - -uint32_t mapping::getThreadIdInWarp() { - uint32_t ThreadIdInWarp = __gpu_lane_id(); - ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); - return ThreadIdInWarp; -} - -uint32_t mapping::getThreadIdInBlock(int32_t Dim) { - uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); - return ThreadIdInBlock; -} - -uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } - -uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { - uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); - // If we are in SPMD mode, remove one warp. 
- return BlockSize - (!IsSPMD * mapping::getWarpSize()); -} -uint32_t mapping::getMaxTeamThreads() { - return mapping::getMaxTeamThreads(mapping::isSPMDMode()); -} - -uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { - return __gpu_num_threads(Dim); -} - -uint32_t mapping::getNumberOfThreadsInKernel() { - return mapping::getNumberOfThreadsInBlock(0) * - mapping::getNumberOfBlocksInKernel(0) * - mapping::getNumberOfThreadsInBlock(1) * - mapping::getNumberOfBlocksInKernel(1) * - mapping::getNumberOfThreadsInBlock(2) * - mapping::getNumberOfBlocksInKernel(2); -} - -uint32_t mapping::getWarpIdInBlock() { - uint32_t WarpID = - mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); - ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); - return WarpID; -} - -uint32_t mapping::getBlockIdInKernel(int32_t Dim) { - uint32_t BlockId = __gpu_block_id(Dim); - ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); - return BlockId; -} - -uint32_t mapping::getNumberOfWarpsInBlock() { - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} - -uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { - return __gpu_num_blocks(Dim); -} - -uint32_t mapping::getNumberOfProcessorElements() { - return static_cast<uint32_t>(config::getHardwareParallelism()); -} - -///} - -/// Execution mode -/// -///{ - -// TODO: This is a workaround for initialization coming from kernels outside of -// the TU. We will need to solve this more correctly in the future. 
-[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode; - -void mapping::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - IsSPMDMode = IsSPMD; -} - -bool mapping::isSPMDMode() { return IsSPMDMode; } - -bool mapping::isGenericMode() { return !isSPMDMode(); } -///} - -extern "C" { -[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { - return mapping::getThreadIdInBlock(); -} - -[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { - return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); -} - -[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { - return mapping::getWarpSize(); -} -} - -#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \ - extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); } - -_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock) -_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) -_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) -_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) - -extern "C" { -uint64_t ompx_ballot_sync(uint64_t mask, int pred) { - return utils::ballotSync(mask, pred); -} - -int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { - return utils::shuffleDown(mask, var, delta, width); -} - -float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, - int width) { - return utils::bitCast<float>( - utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width)); -} - -long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { - return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width); -} - -double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, - int width) { - return utils::bitCast<double>( - utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width)); -} -} diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp deleted file mode 100644 index a89f8b2a7453..000000000000 --- 
a/offload/DeviceRTL/src/Misc.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "Shared/RPCOpcodes.h" -#include "shared/rpc.h" - -#include "Debug.h" - -namespace ompx { -namespace impl { - -/// Lookup a device-side function using a host pointer /p HstPtr using the table -/// provided by the device plugin. The table is an ordered pair of host and -/// device pointers sorted on the value of the host pointer. -void *indirectCallLookup(void *HstPtr) { - if (!HstPtr) - return nullptr; - - struct IndirectCallTable { - void *HstPtr; - void *DevPtr; - }; - IndirectCallTable *Table = - reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr()); - uint64_t TableSize = config::getIndirectCallTableSize(); - - // If the table is empty we assume this is device pointer. - if (!Table || !TableSize) - return HstPtr; - - uint32_t Left = 0; - uint32_t Right = TableSize; - - // If the pointer is definitely not contained in the table we exit early. - if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr) - return HstPtr; - - while (Left != Right) { - uint32_t Current = Left + (Right - Left) / 2; - if (Table[Current].HstPtr == HstPtr) - return Table[Current].DevPtr; - - if (HstPtr < Table[Current].HstPtr) - Right = Current; - else - Left = Current; - } - - // If we searched the whole table and found nothing this is a device pointer. - return HstPtr; -} - -/// The openmp client instance used to communicate with the server. 
-[[gnu::visibility("protected"), - gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); - -} // namespace impl -} // namespace ompx - -/// Interfaces -/// -///{ - -extern "C" { -int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } - -int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } - -double omp_get_wtick(void) { - // The number of ticks per second for the AMDGPU clock varies by card and can - // only be retrieved by querying the driver. We rely on the device environment - // to inform us what the proper frequency is. NVPTX uses a nanosecond - // resolution, we could omit the global read but this makes it consistent. - return 1.0 / ompx::config::getClockFrequency(); -} - -double omp_get_wtime(void) { - return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick(); -} - -void *__llvm_omp_indirect_call_lookup(void *HstPtr) { - return ompx::impl::indirectCallLookup(HstPtr); -} - -void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - return malloc(size); - default: - return nullptr; - } -} - -void omp_free(void *ptr, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - free(ptr); - case omp_null_allocator: - default: - return; - } -} - -unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { - rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>(); - Port.send_n(data, size); - Port.send([=](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = reinterpret_cast<uintptr_t>(fn); - }); - unsigned long long Ret; - Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Ret = static_cast<unsigned long long>(Buffer->data[0]); - }); - Port.close(); - 
return Ret; -} -} - -///} diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp deleted file mode 100644 index 08ce616aee1c..000000000000 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ /dev/null @@ -1,311 +0,0 @@ -//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { - uint32_t NThreadsICV = - NumThreadsClause != -1 ? 
NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getMaxTeamThreads(); - - if (NThreadsICV != 0 && NThreadsICV < NumThreads) - NumThreads = NThreadsICV; - - // SPMD mode allows any number of threads, for generic mode we round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (mapping::isSPMDMode()) - return NumThreads; - - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); - - return NumThreads; -} - -// Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { - switch (nargs) { -#include "generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -} // namespace - -extern "C" { - -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t PTeamSize = - NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; - // Avoid the race between the read of the `icv::Level` above and the write - // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. 
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - - // Synchronize all threads after the main thread (TId == 0) set up the - // team state properly. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(PTeamSize, ident, - /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - - if (!PTeamSize || TId < PTeamSize) - invokeMicrotask(TId, 0, fn, args, nargs); - - // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); - } - - // Synchronize all threads to make sure every thread exits the scope above; - // otherwise the following assertions and the assumption in - // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - - return; -} - -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - - // Assert the parallelism level is zero if disabled by the user. 
- ASSERT((config::mayUseNestedParallelism() || icv::Level == 0), - "nested parallelism while disabled"); - - // Handle the serialized case first, same for SPMD/non-SPMD: - // 1) if-clause(0) - // 2) parallel in task or other thread state inducing construct - // 3) nested parallel regions - if (OMP_UNLIKELY(!if_expr || state::HasThreadState || - (config::mayUseNestedParallelism() && icv::Level))) { - state::DateEnvironmentRAII DERAII(ident); - ++icv::Level; - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - // From this point forward we know that there is no thread state used. - ASSERT(state::HasThreadState == false, nullptr); - - if (mapping::isSPMDMode()) { - // This was moved to its own routine so it could be called directly - // in certain situations to avoid resource consumption of unused - // logic in parallel_51. - __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); - - return; - } - - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); - uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - - // We do *not* create a new data environment because all threads in the team - // that are active are now running this parallel region. They share the - // TeamState, which has an increase level-var and potentially active-level - // set, but they do not have individual ThreadStates yet. If they ever - // modify the ICVs beyond this point a ThreadStates will be allocated. 
- - bool IsActiveParallelRegion = NumThreads > 1; - if (!IsActiveParallelRegion) { - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - void **GlobalArgs = nullptr; - if (nargs) { - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - switch (nargs) { - default: - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - break; - case 16: - GlobalArgs[15] = args[15]; - [[fallthrough]]; - case 15: - GlobalArgs[14] = args[14]; - [[fallthrough]]; - case 14: - GlobalArgs[13] = args[13]; - [[fallthrough]]; - case 13: - GlobalArgs[12] = args[12]; - [[fallthrough]]; - case 12: - GlobalArgs[11] = args[11]; - [[fallthrough]]; - case 11: - GlobalArgs[10] = args[10]; - [[fallthrough]]; - case 10: - GlobalArgs[9] = args[9]; - [[fallthrough]]; - case 9: - GlobalArgs[8] = args[8]; - [[fallthrough]]; - case 8: - GlobalArgs[7] = args[7]; - [[fallthrough]]; - case 7: - GlobalArgs[6] = args[6]; - [[fallthrough]]; - case 6: - GlobalArgs[5] = args[5]; - [[fallthrough]]; - case 5: - GlobalArgs[4] = args[4]; - [[fallthrough]]; - case 4: - GlobalArgs[3] = args[3]; - [[fallthrough]]; - case 3: - GlobalArgs[2] = args[2]; - [[fallthrough]]; - case 2: - GlobalArgs[1] = args[1]; - [[fallthrough]]; - case 1: - GlobalArgs[0] = args[0]; - [[fallthrough]]; - case 0: - break; - } - } - - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. 
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, - (void *)nullptr, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - - // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); - // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); - } - - if (nargs) - __kmpc_end_sharing_variables(); -} - -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { - // Work function and arguments for L1 parallel region. - *WorkFn = state::ParallelRegionFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) - return false; - - // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); - return ThreadIsActive; -} - -[[clang::noinline]] void __kmpc_kernel_end_parallel() { - // In case we have modified an ICV for this thread before a ThreadState was - // created. We drop it now to not contaminate the next parallel region. 
- ASSERT(!mapping::isSPMDMode(), nullptr); - uint32_t TId = mapping::getThreadIdInBlock(); - state::resetStateForThread(TId); - ASSERT(!mapping::isSPMDMode(), nullptr); -} - -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } - -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } - -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} - -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} -} diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp deleted file mode 100644 index df141af5ebee..000000000000 --- a/offload/DeviceRTL/src/Profiling.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===------- Profiling.cpp ---------------------------------------- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Profiling.h" - -extern "C" { - -// Provides empty implementations for certain functions in compiler-rt -// that are emitted by the PGO instrumentation. -void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} -} diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp deleted file mode 100644 index fffd0063940c..000000000000 --- a/offload/DeviceRTL/src/Reduction.cpp +++ /dev/null @@ -1,316 +0,0 @@ -//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { - for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -static uint32_t gpu_irregular_simd_reduce(void *reduce_data, - ShuffleReductFnTy shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); - __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); - do { - Liveness = mapping::activemask(); - remote_id = utils::ffs(Liveness & lanemask_gt); - size = utils::popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -static int32_t 
nvptx_parallel_reduce_nowait(void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - uint32_t BlockThreadId = mapping::getThreadIdInBlock(); - if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false)) - BlockThreadId = 0; - uint32_t NumThreads = omp_get_num_threads(); - if (NumThreads == 1) - return 1; - - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // - -#if __has_builtin(__nvvm_reflect) - if (__nvvm_reflect("__CUDA_ARCH") >= 700) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || - (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize()); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; - } -#endif - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - if (Liveness == lanes::All) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/utils::popc(Liveness), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - else { // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - } - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case - // of an L2 parallel region. - return BlockThreadId == 0; -} - -uint32_t roundToWarpsize(uint32_t s) { - if (s < mapping::getWarpSize()) - return 1; - return (s & ~(unsigned)(mapping::getWarpSize() - 1)); -} - -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } - -} // namespace - -extern "C" { -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); -} - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) { - // Terminate all threads in non-SPMD mode except for the master thread. - uint32_t ThreadId = mapping::getThreadIdInBlock(); - if (mapping::isGenericMode()) { - if (!mapping::isMainThreadInGenericMode()) - return 0; - ThreadId = 0; - } - - uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; - uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = omp_get_num_threads(); - uint32_t TeamId = omp_get_team_num(); - uint32_t NumTeams = omp_get_num_teams(); - [[clang::loader_uninitialized]] static Local<unsigned> Bound; - [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. 
- bool IsMaster = (ThreadId == 0); - while (IsMaster) { - Bound = atomic::load(&IterCnt, atomic::acquire); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) { - lgcpyFct(GlobalBuffer, ModBockId, reduce_data); - } else - lgredFct(GlobalBuffer, ModBockId, reduce_data); - - // Propagate the memory writes above to the world. - fence::kernel(atomic::release); - - // Increment team counter. - // This counter is incremented by all teams in the current - // num_of_records chunk. - ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst, - atomic::MemScopeTy::device); - } - - // Synchronize in SPMD mode as in generic mode all but 1 threads are in the - // state machine. - if (mapping::isSPMDMode()) - synchronize::threadsAligned(atomic::acq_rel); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // Ensure we see the global memory writes by other teams - fence::kernel(atomic::acquire); - - // - // Last team processing. 
- // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(GlobalBuffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(GlobalBuffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); - if (ActiveThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst); - } - - return 0; -} -} - -void *__kmpc_reduction_get_fixed_buffer() { - return state::getKernelLaunchEnvironment().ReductionBuffer; -} diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp deleted file mode 100644 index 475395102f47..000000000000 --- a/offload/DeviceRTL/src/State.cpp +++ /dev/null @@ -1,482 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -/// Memory implementation -/// -///{ - -/// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[]; - -/// The kernel environment passed to the init method by the compiler. -[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *> - KernelEnvironmentPtr; - -/// The kernel launch environment passed as argument to the kernel by the -/// runtime. -[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> - KernelLaunchEnvironmentPtr; - -///} - -namespace { - -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - -/// A "smart" stack in shared memory. -/// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. 
The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. -/// -struct SharedMemorySmartStackTy { - /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); - - /// Allocate \p Bytes on the stack for the encountering thread. Each thread - /// can call this function. - void *push(uint64_t Bytes); - - /// Deallocate the last allocation made by the encountering thread and pointed - /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); - -private: - /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock, - allocator::ALIGNMENT); - } - - /// Return the top address of the warp data stack, that is the first address - /// this warp will allocate memory at next. - void *getThreadDataTop(uint32_t TId) { - return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; - } - - /// The actual storage, shared among all warps. - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; -}; - -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - -/// The allocation of a single shared memory scratchpad. -[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy> - SharedMemorySmartStack; - -void SharedMemorySmartStackTy::init(bool IsSPMD) { - Usage[mapping::getThreadIdInBlock()] = 0; -} - -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { - // First align the number of requested bytes. 
- /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to - /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - - uint32_t StorageTotal = computeThreadStorageTotal(); - - // The main thread in generic mode gets the space of its entire warp as the - // other threads do not participate in any computation at all. - if (mapping::isMainThreadInGenericMode()) - StorageTotal *= mapping::getWarpSize(); - - int TId = mapping::getThreadIdInBlock(); - if (Usage[TId] + AlignedBytes <= StorageTotal) { - void *Ptr = getThreadDataTop(TId); - Usage[TId] += AlignedBytes; - return Ptr; - } - - if (config::isDebugMode(DeviceDebugKind::CommonIssues)) - printf("Shared memory stack full, fallback to dynamic allocation of global " - "memory will negatively impact performance.\n"); - void *GlobalMemory = memory::allocGlobal( - AlignedBytes, "Slow path shared memory allocation, insufficient " - "shared memory stack memory!"); - ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!"); - - return GlobalMemory; -} - -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - if (utils::isSharedMemPtr(Ptr)) { - int TId = mapping::getThreadIdInBlock(); - Usage[TId] -= AlignedBytes; - return; - } - memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); -} - -} // namespace - -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } - -void *memory::allocShared(uint64_t Bytes, const char *Reason) { - return SharedMemorySmartStack.push(Bytes); -} - -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { - SharedMemorySmartStack.pop(Ptr, Bytes); -} - -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); - if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) - printf("nullptr returned by malloc!\n"); 
- return Ptr; -} - -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } - -///} - -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { - return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & - (ActiveLevelVar == Other.ActiveLevelVar) & - (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & - (RunSchedVar == Other.RunSchedVar) & - (RunSchedChunkVar == Other.RunSchedChunkVar); -} - -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { - ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); - ASSERT(LevelVar == Other.LevelVar, nullptr); - ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); - ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); - ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); - ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); -} - -void state::TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = 0; - ICVState.LevelVar = 0; - ICVState.ActiveLevelVar = 0; - ICVState.Padding0Val = 0; - ICVState.MaxActiveLevelsVar = 1; - ICVState.RunSchedVar = omp_sched_static; - ICVState.RunSchedChunkVar = 1; - ParallelTeamSize = 1; - HasThreadState = false; - ParallelRegionFnVar = nullptr; -} - -bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { - return (ICVState == Other.ICVState) & - (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); -} - -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { - ICVState.assertEqual(Other.ICVState); - ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); - ASSERT(HasThreadState == Other.HasThreadState, nullptr); -} - -[[clang::loader_uninitialized]] Local<state::TeamStateTy> - ompx::state::TeamState; -[[clang::loader_uninitialized]] Local<state::ThreadStateTy **> - ompx::state::ThreadStates; - -namespace { - -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { - if (Level == 0) - return DefaultVal; - int LevelVar = 
omp_get_level(); - if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) - return OutOfBoundsVal; - int ActiveLevel = icv::ActiveLevel; - if (OMP_UNLIKELY(Level != ActiveLevel)) - return DefaultVal; - return Val; -} - -} // namespace - -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - SharedMemorySmartStack.init(IsSPMD); - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - TeamState.init(IsSPMD); - ThreadStates = nullptr; - KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; - } -} - -KernelEnvironmentTy &state::getKernelEnvironment() { - return *KernelEnvironmentPtr; -} - -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { - return *KernelLaunchEnvironmentPtr; -} - -void state::enterDataEnvironment(IdentTy *Ident) { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - if (!config::mayUseThreadStates()) - return; - - unsigned TId = mapping::getThreadIdInBlock(); - ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>( - memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); - uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); - if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = - sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); - void *ThreadStatesPtr = - memory::allocGlobal(Bytes, "Thread state array allocation"); - __builtin_memset(ThreadStatesPtr, 0, Bytes); - if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), - reinterpret_cast<uintptr_t>(ThreadStatesPtr), - atomic::seq_cst, atomic::seq_cst)) - memory::freeGlobal(ThreadStatesPtr, - "Thread state array allocated multiple times"); - ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst), - "Expected valid thread states bit!"); - } - NewThreadState->init(ThreadStates[TId]); - TeamState.HasThreadState = true; - ThreadStates[TId] = NewThreadState; -} 
- -void state::exitDataEnvironment() { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - - unsigned TId = mapping::getThreadIdInBlock(); - resetStateForThread(TId); -} - -void state::resetStateForThread(uint32_t TId) { - if (!config::mayUseThreadStates()) - return; - if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) - return; - - ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; - memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc"); - ThreadStates[TId] = PreviousThreadState; -} - -void state::runAndCheckState(void(Func(void))) { - TeamStateTy OldTeamState = TeamState; - OldTeamState.assertEqual(TeamState); - - Func(); - - OldTeamState.assertEqual(TeamState); -} - -void state::assumeInitialState(bool IsSPMD) { - TeamStateTy InitialTeamState; - InitialTeamState.init(IsSPMD); - InitialTeamState.assertEqual(TeamState); - ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); -} - -int state::getEffectivePTeamSize() { - int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); -} - -extern "C" { -void omp_set_dynamic(int V) {} - -int omp_get_dynamic(void) { return 0; } - -void omp_set_num_threads(int V) { icv::NThreads = V; } - -int omp_get_max_threads(void) { - int NT = icv::NThreads; - return NT > 0 ? 
NT : mapping::getMaxTeamThreads(); -} - -int omp_get_level(void) { - int LevelVar = icv::Level; - ASSERT(LevelVar >= 0, nullptr); - return LevelVar; -} - -int omp_get_active_level(void) { return !!icv::ActiveLevel; } - -int omp_in_parallel(void) { return !!icv::ActiveLevel; } - -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { - *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); - *ChunkSize = state::RunSchedChunk; -} - -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { - icv::RunSched = (int)ScheduleKind; - state::RunSchedChunk = ChunkSize; -} - -int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); -} - -int omp_get_thread_num(void) { - return omp_get_ancestor_thread_num(omp_get_level()); -} - -int omp_get_team_size(int Level) { - return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); -} - -int omp_get_num_threads(void) { - return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); -} - -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } - -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } - -void omp_set_nested(int) {} - -int omp_get_nested(void) { return false; } - -void omp_set_max_active_levels(int Levels) { - icv::MaxActiveLevels = Levels > 0 ? 
1 : 0; -} - -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } - -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } - -int omp_get_num_places(void) { return 0; } - -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } - -void omp_get_place_proc_ids(int, int *) { - // TODO -} - -int omp_get_place_num(void) { return 0; } - -int omp_get_partition_num_places(void) { return 0; } - -void omp_get_partition_place_nums(int *) { - // TODO -} - -int omp_get_cancellation(void) { return 0; } - -void omp_set_default_device(int) {} - -int omp_get_default_device(void) { return -1; } - -int omp_get_num_devices(void) { return config::getNumDevices(); } - -int omp_get_device_num(void) { return config::getDeviceNum(); } - -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } - -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } - -int omp_get_initial_device(void) { return -1; } - -int omp_is_initial_device(void) { return 0; } -} - -extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { - return memory::allocShared(Bytes, "Frontend alloc shared"); -} - -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { - memory::freeShared(Ptr, Bytes, "Frontend free shared"); -} - -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } - -void *llvm_omp_target_dynamic_shared_alloc() { - return __kmpc_get_dynamic_shared(); -} - -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. 
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; - -[[clang::loader_uninitialized]] static Local<void *> - SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static Local<void **> - SharedMemVariableSharingSpacePtr; - -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal( - nArgs * sizeof(void *), "new extended args"); - ASSERT(SharedMemVariableSharingSpacePtr != nullptr, - "Nullptr returned by malloc!"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} -} diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp deleted file mode 100644 index e833423eb265..000000000000 --- a/offload/DeviceRTL/src/Stub.cpp +++ /dev/null @@ -1 +0,0 @@ -// This is an empty file used to create a device fatbinary. diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp deleted file mode 100644 index 2f1ed34a3f6d..000000000000 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ /dev/null @@ -1,379 +0,0 @@ -//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. 
-// -//===----------------------------------------------------------------------===// - -#include "Synchronization.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -namespace impl { - -/// Atomics -/// -///{ -///} - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - // builtin_amdgcn_atomic_inc32 should expand to this switch when - // passed a runtime value, but does not do so yet. Workaround here. - -#define ScopeSwitch(ORDER) \ - switch (MemScope) { \ - case atomic::MemScopeTy::system: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \ - case atomic::MemScopeTy::device: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \ - case atomic::MemScopeTy::workgroup: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \ - case atomic::MemScopeTy::wavefront: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \ - case atomic::MemScopeTy::single: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \ - } - -#define Case(ORDER) \ - case ORDER: \ - ScopeSwitch(ORDER) - - switch (Ordering) { - default: - __builtin_unreachable(); - Case(atomic::relaxed); - Case(atomic::acquire); - Case(atomic::release); - Case(atomic::acq_rel); - Case(atomic::seq_cst); -#undef Case -#undef ScopeSwitch - } -} - -[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker; - -void namedBarrierInit() { - // Don't have global ctors, and shared memory is not zero init - atomic::store(&namedBarrierTracker, 0u, atomic::release); -} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - // assert(NumThreads % 32 == 0); - - uint32_t WarpSize = mapping::getWarpSize(); - uint32_t NumWaves = NumThreads / WarpSize; - - fence::team(atomic::acquire); - - // named barrier 
implementation for amdgcn. - // Uses two 16 bit unsigned counters. One for the number of waves to have - // reached the barrier, and one to count how many times the barrier has been - // passed. These are packed in a single atomically accessed 32 bit integer. - // Low bits for the number of waves, assumed zero before this call. - // High bits to count the number of times the barrier has been passed. - - // precondition: NumWaves != 0; - // invariant: NumWaves * WarpSize == NumThreads; - // precondition: NumWaves < 0xffffu; - - // Increment the low 16 bits once, using the lowest active thread. - if (mapping::isLeaderInWarp()) { - uint32_t load = atomic::add(&namedBarrierTracker, 1, - atomic::relaxed); // commutative - - // Record the number of times the barrier has been passed - uint32_t generation = load & 0xffff0000u; - - if ((load & 0x0000ffffu) == (NumWaves - 1)) { - // Reached NumWaves in low bits so this is the last wave. - // Set low bits to zero and increment high bits - load += 0x00010000u; // wrap is safe - load &= 0xffff0000u; // because bits zeroed second - - // Reset the wave counter and release the waiting waves - atomic::store(&namedBarrierTracker, load, atomic::relaxed); - } else { - // more waves still to go, spin until generation counter changes - do { - __builtin_amdgcn_s_sleep(0); - load = atomic::load(&namedBarrierTracker, atomic::relaxed); - } while ((load & 0xffff0000u) == generation); - } - } - fence::team(atomic::release); -} - -void fenceTeam(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); -} - -void fenceKernel(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::device); -} - -void fenceSystem(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::system); -} - -void syncWarp(__kmpc_impl_lanemask_t) { - // This is a no-op on current AMDGPU hardware but it is used by the optimizer - // to enforce convergent behaviour between 
control flow graphs. - __builtin_amdgcn_wave_barrier(); -} - -void syncThreads(atomic::OrderingTy Ordering) { - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); - - __builtin_amdgcn_s_barrier(); - - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst); -} -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } - -// TODO: Don't have wavefront lane locks. Possibly can't have them. -void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } - -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -void unsetCriticalLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); -} - -void setCriticalLock(omp_lock_t *Lock) { - uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; - if (mapping::getThreadIdInWarp() == LowestActiveThread) { - fenceKernel(atomic::release); - while ( - !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) { - __builtin_amdgcn_s_sleep(32); - } - fenceKernel(atomic::acquire); - } -} - -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return __nvvm_atom_inc_gen_ui(Address, Val); -} - -void namedBarrierInit() {} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - ASSERT(NumThreads % 32 == 0, nullptr); - - // The named barrier for active parallel threads of a team in an L1 parallel - // region to synchronize with each other. 
- constexpr int BarrierNo = 7; - __nvvm_barrier_sync_cnt(BarrierNo, NumThreads); -} - -void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } - -void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } - -void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } - -void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } - -void syncThreads(atomic::OrderingTy Ordering) { - constexpr int BarrierNo = 8; - __nvvm_barrier_sync(BarrierNo); -} - -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } - -constexpr uint32_t OMP_SPIN = 1000; -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -// TODO: This seems to hide a bug in the declare variant handling. If it is -// called before it is defined -// here the overload won't happen. Investigate lalter! -void unsetLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst); -} - -int testLock(omp_lock_t *Lock) { - return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst); -} - -void initLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setLock(omp_lock_t *Lock) { - // TODO: not sure spinning is a good idea here.. - while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, - atomic::seq_cst) != UNSET) { - int32_t start = __nvvm_read_ptx_sreg_clock(); - int32_t now; - for (;;) { - now = __nvvm_read_ptx_sreg_clock(); - int32_t cycles = now > start ? 
now - start : now + (0xffffffff - start); - if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } - -#endif -///} - -} // namespace impl - -void synchronize::init(bool IsSPMD) { - if (!IsSPMD) - impl::namedBarrierInit(); -} - -void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } - -void synchronize::threads(atomic::OrderingTy Ordering) { - impl::syncThreads(Ordering); -} - -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { - impl::syncThreadsAligned(Ordering); -} - -void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } - -void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); } - -void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } - -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return impl::atomicInc(Addr, V, Ordering, MemScope); -} - -void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } - -extern "C" { -void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { - __kmpc_barrier(Loc, TId); - return 0; -} - -void __kmpc_barrier(IdentTy *Loc, int32_t TId) { - if (mapping::isSPMDMode()) - return __kmpc_barrier_simple_spmd(Loc, TId); - - // Generic parallel regions are run with multiple of the warp size or single - // threaded, in the latter case we need to stop here. 
- if (omp_get_num_threads() == 1) - return __kmpc_flush(Loc); - - impl::namedBarrier(); -} - -[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { - synchronize::threadsAligned(atomic::OrderingTy::seq_cst); -} - -[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { - synchronize::threads(atomic::OrderingTy::seq_cst); -} - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { - return omp_get_thread_num() == 0; -} - -void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { - return omp_get_thread_num() == Filter; -} - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { - return __kmpc_master(Loc, TId); -} - -void __kmpc_end_single(IdentTy *Loc, int32_t TId) { - // The barrier is explicitly called. -} - -void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } - -uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); } - -void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } - -void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } - -void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } - -void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } - -void ompx_sync_block(int Ordering) { - impl::syncThreadsAligned(atomic::OrderingTy(Ordering)); -} -void ompx_sync_block_acq_rel() { - impl::syncThreadsAligned(atomic::OrderingTy::acq_rel); -} -void ompx_sync_block_divergent(int Ordering) { - 
impl::syncThreads(atomic::OrderingTy(Ordering)); -} -} // extern "C" diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp deleted file mode 100644 index d0be0ace50df..000000000000 --- a/offload/DeviceRTL/src/Tasking.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// TODO: We should not allocate and execute the task in two steps. A new API is -// needed for that though. -// -//===----------------------------------------------------------------------===// - -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" - -using namespace ompx; - -extern "C" { - -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn) { - auto TaskSizeInclPrivateValuesPadded = - utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *)); - auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; - TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( - TaskSizeTotal, "explicit task descriptor"); - TaskDescriptor->Payload = - utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); - TaskDescriptor->TaskFn = TaskFn; - - return TaskDescriptor; -} - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *) { - state::DateEnvironmentRAII 
DERAII(Loc); - - TaskDescriptor->TaskFn(0, TaskDescriptor); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); - return 0; -} - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::enterDataEnvironment(Loc); -} - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::exitDataEnvironment(); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); -} - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *) {} - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *) { - // Skip task entirely if empty iteration space. - if (*LowerBound > *UpperBound) - return; - - // The compiler has already stored lb and ub in the TaskDescriptorTy structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. 
- return 1; -} - -int omp_get_max_task_priority(void) { return 0; } -} diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp deleted file mode 100644 index 59a2cc3f27ac..000000000000 --- a/offload/DeviceRTL/src/Workshare.cpp +++ /dev/null @@ -1,970 +0,0 @@ -//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. -// -//===----------------------------------------------------------------------===// - -#include "Workshare.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -// TODO: -struct DynamicScheduleTracker { - int64_t Chunk; - int64_t LoopUpperBound; - int64_t NextLowerBound; - int64_t Stride; - kmp_sched_t ScheduleType; - DynamicScheduleTracker *NextDST; -}; - -#define ASSERT0(...) - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -// TODO: This variable is a hack inherited from the old runtime. -[[clang::loader_uninitialized]] static Local<uint64_t> Cnt; - -template <typename T, typename ST> struct omptarget_nvptx_LoopSupport { - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! 
\brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, - T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the beginning of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, - T entityId, T numberOfEntities) { - // No chunk size specified. 
Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - int32_t gtid = omp_get_thread_num(); - int numberOfActiveOMPThreads = omp_get_num_threads(); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - [[fallthrough]]; - } 
// note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * omp_get_team_num() + gtid, - omp_get_num_teams() * numberOfActiveOMPThreads); - break; - } - default: { - // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - static void dispatch_init(IdentTy *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, - DynamicScheduleTracker *DST) { - int tid = mapping::getThreadIdInBlock(); - T tnum = omp_get_num_threads(); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. 
We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched; - int ChunkInt; - omp_get_schedule(&rtSched, &ChunkInt); - chunk = ChunkInt; - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - } else { - // ASSERT(LT_FUSSY, - // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - // "unknown schedule %d & chunk %lld\n", (int)schedule, - // (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - 
// round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - DST->ScheduleType = schedule; - if (chunk < 1) - chunk = 1; - DST->Chunk = chunk; - DST->LoopUpperBound = ub; - DST->NextLowerBound = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - Cnt = 0; - fence::team(atomic::seq_cst); - } - __kmpc_barrier(loc, threadId); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = mapping::activemask(); - uint32_t leader = utils::ffs(active) - 1; - uint32_t change = utils::popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT(); - unsigned int rank = utils::popc(active & lane_mask_lt); - uint64_t warp_res = 0; - if (rank == 0) { - warp_res = atomic::add(&Cnt, change, atomic::seq_cst); - } - warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); - return warp_res + rank; - } - - static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, - T loopUpperBound) { - T N = NextIter(); 
- lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - return FINISHED; - } - - static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride, - DynamicScheduleTracker *DST) { - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = DST->ScheduleType; - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = DST->NextLowerBound; - T ub = DST->LoopUpperBound; - // finished? 
- if (myLb > ub) { - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = DST->Chunk; - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = DST->Stride; - DST->NextLowerBound = myLb + stride; - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, - DST->LoopUpperBound); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - return DISPATCH_NOTFINISHED; - } - - static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// TODO: Expand the dispatch API to take a DST pointer which can then be -// allocated properly without malloc. -// For now, each team will contain an LDS pointer (ThreadDST) to a global array -// of references to the DST structs allocated (in global memory) for each thread -// in the team. 
The global memory array is allocated during the init phase if it -// was not allocated already and will be deallocated when the dispatch phase -// ends: -// -// __kmpc_dispatch_init -// -// ** Dispatch loop ** -// -// __kmpc_dispatch_deinit -// -[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **> - ThreadDST; - -// Create a new DST, link the current one, and define the new as current. -static DynamicScheduleTracker *pushDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - // Each block will allocate an array of pointers to DST structs. The array is - // equal in length to the number of threads in that block. - if (!ThreadDST) { - // Allocate global memory array of pointers to DST structs: - if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0) - ThreadDST = static_cast<DynamicScheduleTracker **>( - memory::allocGlobal(mapping::getNumberOfThreadsInBlock() * - sizeof(DynamicScheduleTracker *), - "new ThreadDST array")); - synchronize::threads(atomic::seq_cst); - - // Initialize the array pointers: - ThreadDST[ThreadIndex] = nullptr; - } - - // Create a DST struct for the current thread: - DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>( - memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST")); - *NewDST = DynamicScheduleTracker({0}); - - // Add the new DST struct to the array of DST structs: - NewDST->NextDST = ThreadDST[ThreadIndex]; - ThreadDST[ThreadIndex] = NewDST; - return NewDST; -} - -// Return the current DST. -static DynamicScheduleTracker *peekDST() { - return ThreadDST[mapping::getThreadIdInBlock()]; -} - -// Pop the current DST and restore the last one. -static void popDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex]; - DynamicScheduleTracker *OldDST = CurrentDST->NextDST; - memory::freeGlobal(CurrentDST, "remove DST"); - ThreadDST[ThreadIndex] = OldDST; - - // Check if we need to deallocate the global array. 
Ensure all threads - // in the block have finished deallocating the individual DSTs. - synchronize::threads(atomic::seq_cst); - if (!ThreadDST[ThreadIndex] && !ThreadIndex) { - memory::freeGlobal(ThreadDST, "remove ThreadDST array"); - ThreadDST = nullptr; - } - synchronize::threads(atomic::seq_cst); -} - -void workshare::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - ThreadDST = nullptr; -} - -extern "C" { - -// init -void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule, - int32_t lb, int32_t ub, int32_t st, int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule, - uint32_t lb, uint32_t ub, int32_t st, - int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule, - int64_t lb, int64_t ub, int64_t st, int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule, - uint64_t lb, uint64_t ub, int64_t st, - int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -// next -int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_4u(IdentTy *loc, 
int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -// fini -void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini(); -} - -// deinit -void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} 
- -void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t 
incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} - -void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} -} - -namespace ompx { - -/// Helper class to hide the generic loop nest and provide the template argument -/// throughout. -template <typename Ty> class StaticLoopChunker { - - /// Generic loop nest that handles block and/or thread distribution in the - /// absence of user specified chunk sizes. This implicitly picks a block chunk - /// size equal to the number of threads in the block and a thread chunk size - /// equal to one. In contrast to the chunked version we can get away with a - /// single loop in this case - static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, - Ty NumBlocks, Ty BId, Ty NumThreads, - Ty TId, Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * NumThreads; - - // Start index in the normalized space. - Ty IV = BId * NumThreads + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - if (IV < NumIters) { - do { - - // Execute the loop body. - LoopBody(IV, Arg); - - // Every thread executed one block and thread chunk now. 
- IV += KernelIteration; - - if (OneIterationPerThread) - return; - - } while (IV < NumIters); - } - } - - /// Generic loop nest that handles block and/or thread distribution in the - /// presence of user specified chunk sizes (for at least one of them). - static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg, - Ty BlockChunk, Ty NumBlocks, Ty BId, - Ty ThreadChunk, Ty NumThreads, Ty TId, - Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * BlockChunk; - - // Start index in the chunked space. - Ty IV = BId * BlockChunk + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - do { - - Ty BlockChunkLeft = - BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0; - Ty ThreadChunkLeft = - ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft; - - while (ThreadChunkLeft--) { - - // Given the blocking it's hard to keep track of what to execute. - if (IV >= NumIters) - return; - - // Execute the loop body. - LoopBody(IV, Arg); - - if (OneIterationPerThread) - return; - - ++IV; - } - - IV += KernelIteration; - - } while (IV < NumIters); - } - -public: - /// Worksharing `for`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. 
- static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but we don't know if we are in a - // parallel at all or if the user might have used a `num_threads` clause - // on the parallel and reduced the number compared to the block size. - // Since nested parallels are possible too we need to get the thread id - // from the `omp` getter and not the mapping directly. - Ty TId = omp_get_thread_num(); - - // There are no blocks involved here. - Ty BlockChunk = 0; - Ty NumBlocks = 1; - Ty BId = 0; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumThreads >= NumIters, "Broken assumption"); - - if (ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - } - - /// Worksharing `distribute`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] BlockChunk Size of block chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. 
- static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - - // There are no threads involved here. - Ty ThreadChunk = 0; - Ty NumThreads = 1; - Ty TId = 0; - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If we know we have more blocks than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeTeamsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - } - - /// Worksharing `distribute parallel for`-loop. 
- /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] BlockChunk Size of block chunk - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), - void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but the user might have used a - // `num_threads` clause on the parallel and reduced the number compared to - // the block size. - Ty TId = mapping::getThreadIdInBlock(); - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads (across all blocks) than iterations we - // can indicate that to avoid an outer loop. 
- if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads || ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - } -}; - -} // namespace ompx - -#define OMP_LOOP_ENTRY(BW, TY) \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY block_chunk, TY thread_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::DistributeFor( \ - loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \ - one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ - void *arg, TY num_iters, TY block_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::Distribute( \ - loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \ - thread_chunk, one_iteration_per_thread); \ - } - -extern "C" { -OMP_LOOP_ENTRY(_4, int32_t) -OMP_LOOP_ENTRY(_4u, uint32_t) -OMP_LOOP_ENTRY(_8, int64_t) -OMP_LOOP_ENTRY(_8u, uint64_t) -} diff --git 
a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index 0236f5f0b698..5a27a81c736b 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -15,7 +15,10 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload;flang-rt" CACHE STRING "") set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 16" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "") + +set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "") diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake index a772043c7966..798f080a41ad 100644 --- a/offload/cmake/caches/AMDGPULibcBot.cmake +++ b/offload/cmake/caches/AMDGPULibcBot.cmake @@ -17,5 +17,6 @@ set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "") set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "") -set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;libc;libcxxabi;libcxx" CACHE STRING "") set(RUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_JOBS 4 CACHE STRING "") diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake index 5533a6508f5d..3747a1d3eb29 100644 --- a/offload/cmake/caches/Offload.cmake +++ b/offload/cmake/caches/Offload.cmake @@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") 
set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "") set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "") set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "") -set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "") -set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "") +set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "") diff --git a/offload/include/device.h b/offload/include/device.h index 1e85bb1876c8..bf93ce0460ae 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -33,7 +33,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "GlobalHandler.h" #include "PluginInterface.h" + using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; // Forward declarations. 
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index d1baa28687fb..5b54c79d83f9 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -28,10 +28,13 @@ def ol_device_info_t : Enum { TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">, TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, + TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">, + TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">, + TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">, TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">, TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">, TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 9d342e06127a..7e8e297831f4 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -47,10 +47,59 @@ struct ol_device_impl_t { ol_platform_handle_t Platform, InfoTreeNode &&DevInfo) : DeviceNum(DeviceNum), Device(Device), Platform(Platform), Info(std::forward<InfoTreeNode>(DevInfo)) {} + + ~ol_device_impl_t() { + assert(!OutstandingQueues.size() && + "Device object dropped with outstanding queues"); + } + int DeviceNum; GenericDeviceTy *Device; ol_platform_handle_t Platform; InfoTreeNode Info; + + 
llvm::SmallVector<__tgt_async_info *> OutstandingQueues; + std::mutex OutstandingQueuesMutex; + + /// If the device has any outstanding queues that are now complete, remove it + /// from the list and return it. + /// + /// Queues may be added to the outstanding queue list by olDestroyQueue if + /// they are destroyed but not completed. + __tgt_async_info *getOutstandingQueue() { + // Not locking the `size()` access is fine here - In the worst case we + // either miss a queue that exists or loop through an empty array after + // taking the lock. Both are sub-optimal but not that bad. + if (OutstandingQueues.size()) { + std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex); + + // As queues are pulled and popped from this list, longer running queues + // naturally bubble to the start of the array. Hence looping backwards. + for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend(); + Q++) { + if (!Device->hasPendingWork(*Q)) { + auto OutstandingQueue = *Q; + *Q = OutstandingQueues.back(); + OutstandingQueues.pop_back(); + return OutstandingQueue; + } + } + } + return nullptr; + } + + /// Complete all pending work for this device and perform any needed cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this device handle. 
+ llvm::Error destroy() { + llvm::Error Result = Plugin::success(); + for (auto Q : OutstandingQueues) + if (auto Err = Device->synchronize(Q, /*Release=*/true)) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + OutstandingQueues.clear(); + return Result; + } }; struct ol_platform_impl_t { @@ -58,23 +107,51 @@ struct ol_platform_impl_t { ol_platform_backend_t BackendType) : Plugin(std::move(Plugin)), BackendType(BackendType) {} std::unique_ptr<GenericPluginTy> Plugin; - std::vector<ol_device_impl_t> Devices; + llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices; ol_platform_backend_t BackendType; + + /// Complete all pending work for this platform and perform any needed + /// cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this platform handle. + llvm::Error destroy() { + llvm::Error Result = Plugin::success(); + for (auto &D : Devices) + if (auto Err = D->destroy()) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + + if (auto Res = Plugin->deinit()) + Result = llvm::joinErrors(std::move(Result), std::move(Res)); + + return Result; + } }; struct ol_queue_impl_t { ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device) - : AsyncInfo(AsyncInfo), Device(Device) {} + : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {} __tgt_async_info *AsyncInfo; ol_device_handle_t Device; + // A unique identifier for the queue + size_t Id; + static std::atomic<size_t> IdCounter; }; +std::atomic<size_t> ol_queue_impl_t::IdCounter(0); struct ol_event_impl_t { - ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue) - : EventInfo(EventInfo), Queue(Queue) {} + ol_event_impl_t(void *EventInfo, ol_device_handle_t Device, + ol_queue_handle_t Queue) + : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) { + } // EventInfo may be null, in which case the event should be considered always // complete void *EventInfo; + ol_device_handle_t Device; + size_t 
QueueId; + // Events may outlive the queue - don't assume this is always valid. + // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check + // for queue equality instead. ol_queue_handle_t Queue; }; @@ -131,7 +208,7 @@ struct OffloadContext { ol_device_handle_t HostDevice() { // The host platform is always inserted last - return &Platforms.back().Devices[0]; + return Platforms.back().Devices[0].get(); } static OffloadContext &get() { @@ -190,8 +267,8 @@ Error initPlugins(OffloadContext &Context) { auto Info = Device->obtainInfoImpl(); if (auto Err = Info.takeError()) return Err; - Platform.Devices.emplace_back(DevNum, Device, &Platform, - std::move(*Info)); + Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>( + DevNum, Device, &Platform, std::move(*Info))); } } } @@ -199,7 +276,8 @@ Error initPlugins(OffloadContext &Context) { // Add the special host device auto &HostPlatform = Context.Platforms.emplace_back( ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST}); - HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{}); + HostPlatform.Devices.emplace_back( + std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{})); Context.HostDevice()->Platform = &HostPlatform; Context.TracingEnabled = std::getenv("OFFLOAD_TRACE"); @@ -240,7 +318,7 @@ Error olShutDown_impl() { if (!P.Plugin || !P.Plugin->is_initialized()) continue; - if (auto Res = P.Plugin->deinit()) + if (auto Res = P.destroy()) Result = llvm::joinErrors(std::move(Result), std::move(Res)); } @@ -367,6 +445,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, // Retrieve properties from the plugin interface switch (PropName) { case OL_DEVICE_INFO_NAME: + case OL_DEVICE_INFO_PRODUCT_NAME: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -377,6 +456,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, } case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + case OL_DEVICE_INFO_MAX_WORK_SIZE: case 
OL_DEVICE_INFO_VENDOR_ID: case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: case OL_DEVICE_INFO_ADDRESS_BITS: @@ -393,6 +473,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast<uint32_t>(Value)); } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples ol_dimensions_t Out{0, 0, 0}; @@ -431,6 +512,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, assert(Device == OffloadContext::get().HostDevice()); InfoWriter Info(PropSize, PropValue, PropSizeRet); + constexpr auto uint32_max = std::numeric_limits<uint32_t>::max(); + switch (PropName) { case OL_DEVICE_INFO_PLATFORM: return Info.write<void *>(Device->Platform); @@ -438,6 +521,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST); case OL_DEVICE_INFO_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_PRODUCT_NAME: + return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: @@ -446,6 +531,11 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write<uint32_t>(1); case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1}); + case OL_DEVICE_INFO_MAX_WORK_SIZE: + return Info.write<uint32_t>(uint32_max); + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: + return Info.write<ol_dimensions_t>( + ol_dimensions_t{uint32_max, uint32_max, uint32_max}); case OL_DEVICE_INFO_VENDOR_ID: return Info.write<uint32_t>(0); case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: @@ -505,7 +595,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device, Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) { for (auto &Platform : OffloadContext::get().Platforms) { for (auto &Device : Platform.Devices) { - if (!Callback(&Device, UserData)) { + 
if (!Callback(Device.get(), UserData)) { break; } } @@ -566,14 +656,46 @@ Error olMemFree_impl(void *Address) { Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) { auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device); - if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) + + auto OutstandingQueue = Device->getOutstandingQueue(); + if (OutstandingQueue) { + // The queue is empty, but we still need to sync it to release any temporary + // memory allocations or do other cleanup. + if (auto Err = + Device->Device->synchronize(OutstandingQueue, /*Release=*/false)) + return Err; + CreatedQueue->AsyncInfo = OutstandingQueue; + } else if (auto Err = + Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) { return Err; + } *Queue = CreatedQueue.release(); return Error::success(); } -Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); } +Error olDestroyQueue_impl(ol_queue_handle_t Queue) { + auto *Device = Queue->Device; + // This is safe; as soon as olDestroyQueue is called it is not possible to add + // any more work to the queue, so if it's finished now it will remain finished + // forever. + auto Res = Device->Device->hasPendingWork(Queue->AsyncInfo); + if (!Res) + return Res.takeError(); + + if (!*Res) { + // The queue is complete, so sync it and throw it back into the pool. + if (auto Err = Device->Device->synchronize(Queue->AsyncInfo, + /*Release=*/true)) + return Err; + } else { + // The queue still has outstanding work. Store it so we can check it later. 
+ std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex); + Device->OutstandingQueues.push_back(Queue->AsyncInfo); + } + + return olDestroy(Queue); +} Error olSyncQueue_impl(ol_queue_handle_t Queue) { // Host plugin doesn't have a queue set so it's not safe to call synchronize @@ -601,7 +723,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events, "olWaitEvents asked to wait on a NULL event"); // Do nothing if the event is for this queue or the event is always complete - if (Event->Queue == Queue || !Event->EventInfo) + if (Event->QueueId == Queue->Id || !Event->EventInfo) continue; if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo)) @@ -649,7 +771,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) { if (!Event->EventInfo) return Plugin::success(); - if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo)) return Res; return Error::success(); @@ -657,7 +779,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) { Error olDestroyEvent_impl(ol_event_handle_t Event) { if (Event->EventInfo) - if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo)) return Res; return olDestroy(Event); @@ -708,7 +830,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) { if (auto Err = Pending.takeError()) return Err; - *EventOut = new ol_event_impl_t(nullptr, Queue); + *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue); if (!*Pending) // Queue is empty, don't record an event and consider the event always // complete diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index 6585286bf428..71423ae0c94d 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -37,6 +37,8 @@ using namespace llvm::omp::target::ompt; #endif +using namespace llvm::omp::target::plugin; + 
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const { // First, check if the user disabled atomic map transfer/malloc/dealloc. @@ -97,7 +99,55 @@ llvm::Error DeviceTy::init() { return llvm::Error::success(); } -// Load binary to device. +// Extract the mapping of host function pointers to device function pointers +// from the entry table. Functions marked as 'indirect' in OpenMP will have +// offloading entries generated for them which map the host's function pointer +// to a global containing the corresponding function pointer on the device. +static llvm::Expected<std::pair<void *, uint64_t>> +setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image, + __tgt_device_binary Binary) { + AsyncInfoTy AsyncInfo(Device); + llvm::ArrayRef<llvm::offloading::EntryTy> Entries(Image->EntriesBegin, + Image->EntriesEnd); + llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; + for (const auto &Entry : Entries) { + if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP || + Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) + continue; + + assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); + auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); + + void *Ptr; + if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + + HstPtr = Entry.Address; + if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + } + + // If we do not have any indirect globals we exit early. + if (IndirectCallTable.empty()) + return std::pair{nullptr, 0}; + + // Sort the array to allow for more efficient lookup of device pointers. 
+ llvm::sort(IndirectCallTable, + [](const auto &x, const auto &y) { return x.first < y.first; }); + + uint64_t TableSize = + IndirectCallTable.size() * sizeof(std::pair<void *, void *>); + void *DevicePtr = Device.allocData(TableSize, nullptr, TARGET_ALLOC_DEVICE); + if (Device.submitData(DevicePtr, IndirectCallTable.data(), TableSize, + AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to copy data"); + return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); +} + +// Load binary to device and perform global initialization if needed. llvm::Expected<__tgt_device_binary> DeviceTy::loadBinary(__tgt_device_image *Img) { __tgt_device_binary Binary; @@ -105,6 +155,38 @@ DeviceTy::loadBinary(__tgt_device_image *Img) { if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS) return error::createOffloadError(error::ErrorCode::INVALID_BINARY, "failed to load binary %p", Img); + + // This symbol is optional. + void *DeviceEnvironmentPtr; + if (RTL->get_global(Binary, sizeof(DeviceEnvironmentTy), + "__omp_rtl_device_environment", &DeviceEnvironmentPtr)) + return Binary; + + // Obtain a table mapping host function pointers to device function pointers. + auto CallTablePairOrErr = setupIndirectCallTable(*this, Img, Binary); + if (!CallTablePairOrErr) + return CallTablePairOrErr.takeError(); + + GenericDeviceTy &GenericDevice = RTL->getDevice(RTLDeviceID); + DeviceEnvironmentTy DeviceEnvironment; + DeviceEnvironment.DeviceDebugKind = GenericDevice.getDebugKind(); + DeviceEnvironment.NumDevices = RTL->getNumDevices(); + // TODO: The device ID used here is not the real device ID used by OpenMP. 
+ DeviceEnvironment.DeviceNum = RTLDeviceID; + DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize(); + DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency(); + DeviceEnvironment.IndirectCallTable = + reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); + DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; + DeviceEnvironment.HardwareParallelism = + GenericDevice.getHardwareParallelism(); + + AsyncInfoTy AsyncInfo(*this); + if (submitData(DeviceEnvironmentPtr, &DeviceEnvironment, + sizeof(DeviceEnvironment), AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to copy data"); + return Binary; } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 32e89cc75efc..4c8eba1e7180 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -403,6 +403,12 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, reinterpret_cast<uint64_t>(HstPteeBase); void *TgtPteeBase = reinterpret_cast<void *>( reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta); + DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD + ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", + DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); + DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD + "\n", + DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); // Add shadow pointer tracking // TODO: Support shadow-tracking of larger than VoidPtrSize pointers, diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h index 0b3d54599048..90e5e1780e66 100644 --- a/offload/libomptarget/private.h +++ b/offload/libomptarget/private.h @@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId, const char *Type = nullptr; const char *Implicit = (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? 
"(implicit)" : ""; - if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) + + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH && + ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS) + Type = "attach:always"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) + Type = "attach"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && + ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) Type = "tofrom"; else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) Type = "to"; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7ba55715ff58..c26cfe961aa0 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -924,6 +924,7 @@ private: void *Dst; const void *Src; size_t Size; + size_t NumTimes; }; /// Utility struct holding arguments for freeing buffers to memory managers. @@ -974,9 +975,14 @@ private: StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} /// Schedule a host memory copy action on the slot. - Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { + /// + /// Num times will repeat the copy that many times, sequentially in the dest + /// buffer. + Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size, + size_t NumTimes = 1) { Callbacks.emplace_back(memcpyAction); - ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; + ActionArgs.emplace_back().MemcpyArgs = + MemcpyArgsTy{Dst, Src, Size, NumTimes}; return Plugin::success(); } @@ -1216,7 +1222,11 @@ private: assert(Args->Dst && "Invalid destination buffer"); assert(Args->Src && "Invalid source buffer"); - std::memcpy(Args->Dst, Args->Src, Args->Size); + auto BasePtr = Args->Dst; + for (size_t I = 0; I < Args->NumTimes; I++) { + std::memcpy(BasePtr, Args->Src, Args->Size); + BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size; + } return Plugin::success(); } @@ -1421,7 +1431,8 @@ public: /// manager once the operation completes. 
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, - AMDGPUMemoryManagerTy &MemoryManager) { + AMDGPUMemoryManagerTy &MemoryManager, + size_t NumTimes = 1) { // Retrieve available signals for the operation's outputs. AMDGPUSignalTy *OutputSignals[2] = {}; if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) @@ -1443,7 +1454,8 @@ public: // The std::memcpy is done asynchronously using an async handler. We store // the function's information in the action but it is not actually a // post action. - if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) + if (auto Err = + Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes)) return Err; // Make changes on this slot visible to the async handler's thread. @@ -1464,7 +1476,11 @@ public: std::tie(Curr, InputSignal) = consume(OutputSignal); } else { // All preceding operations completed, copy the memory synchronously. - std::memcpy(Inter, Src, CopySize); + auto *InterPtr = Inter; + for (size_t I = 0; I < NumTimes; I++) { + std::memcpy(InterPtr, Src, CopySize); + InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize; + } // Return the second signal because it will not be used. 
OutputSignals[1]->decreaseUseCount(); @@ -1481,11 +1497,11 @@ public: if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + Agent, CopySize * NumTimes, 1, + &InputSignalRaw, OutputSignal->get()); } return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 0, nullptr, + Agent, CopySize * NumTimes, 0, nullptr, OutputSignal->get()); } @@ -2611,26 +2627,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { - hsa_status_t Status; + // Fast case, where we can use the 4 byte hsa_amd_memory_fill + if (Size % 4 == 0 && + (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) { + uint32_t Pattern; + if (PatternSize == 1) { + auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr); + Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24; + } else if (PatternSize == 2) { + auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr); + Pattern = *Word | (*Word << 16); + } else if (PatternSize == 4) { + Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr); + } else { + // Shouldn't be here if the pattern size is outwith those values + llvm_unreachable("Invalid pattern size"); + } - // We can use hsa_amd_memory_fill for this size, but it's not async so the - // queue needs to be synchronized first - if (PatternSize == 4) { - if (AsyncInfoWrapper.hasQueue()) - if (auto Err = synchronize(AsyncInfoWrapper)) + if (hasPendingWorkImpl(AsyncInfoWrapper)) { + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; - Status = hsa_amd_memory_fill(TgtPtr, - *static_cast<const uint32_t *>(PatternPtr), - Size / PatternSize); - if (auto Err = - Plugin::check(Status, 
"error in hsa_amd_memory_fill: %s\n")) - return Err; - } else { - // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned - // memory and copying to the device in one go. - return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size"); + struct MemFillArgsTy { + void *Dst; + uint32_t Pattern; + int64_t Size; + }; + auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4}; + auto Fill = [](void *Data) { + MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data); + assert(Args && "Invalid arguments"); + + auto Status = + hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size); + delete Args; + auto Err = + Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + if (Err) { + FATAL_MESSAGE(1, "error performing async fill: %s", + toString(std::move(Err)).data()); + } + }; + + // hsa_amd_memory_fill doesn't signal completion using a signal, so use + // the existing host callback logic to handle that instead + return Stream->pushHostCallback(Fill, Args); + } else { + // If there is no pending work, do the fill synchronously + auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4); + return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + } } + + // Slow case; allocate an appropriate memory size and enqueue copies + void *PinnedPtr = nullptr; + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) + return Err; + + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr, + PatternSize, PinnedMemoryManager, + Size / PatternSize); } /// Initialize the async info for interoperability purposes. 
@@ -2744,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Product Name", TmpChar); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) @@ -2861,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Grid Max Size", TmpUInt); + Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE); Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxDim = *Info.add("Grid Max Size per Dimension"); + auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{}, + "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); MaxDim.add("x", GridMaxDim.x); MaxDim.add("y", GridMaxDim.y); MaxDim.add("z", GridMaxDim.z); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 75f87cab6049..6ff3ef8cda17 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -417,6 +417,7 @@ struct GenericKernelTy { case OMP_TGT_EXEC_MODE_SPMD: case OMP_TGT_EXEC_MODE_GENERIC: case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: return true; } return false; @@ -434,6 +435,8 @@ protected: return "Generic"; case OMP_TGT_EXEC_MODE_GENERIC_SPMD: return "Generic-SPMD"; + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: + return "SPMD-No-Loop"; } llvm_unreachable("Unknown execution mode!"); } @@ -471,7 +474,8 @@ private: uint32_t BlockLimitClause[3], uint64_t LoopTripCount, uint32_t &NumThreads, bool IsNumThreadsFromUser) const; - /// Indicate if the kernel works in Generic SPMD, Generic or 
SPMD mode. + /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop + /// or SPMD mode. bool isGenericSPMDMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; @@ -486,6 +490,10 @@ private: bool isBareMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE; } + bool isNoLoopMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + } /// The kernel name. std::string Name; @@ -831,11 +839,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error unloadBinary(DeviceImageTy *Image); virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; - /// Setup the device environment if needed. Notice this setup may not be run - /// on some plugins. By default, it will be executed, but plugins can change - /// this behavior by overriding the shouldSetupDeviceEnvironment function. - Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); - /// Setup the global device memory pool, if the plugin requires one. Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize); @@ -1035,6 +1038,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { uint32_t getDefaultNumBlocks() const { return GridValues.GV_Default_Num_Teams; } + uint32_t getDebugKind() const { return OMPX_DebugKind; } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } @@ -1175,11 +1179,6 @@ private: virtual Error getDeviceHeapSize(uint64_t &V) = 0; virtual Error setDeviceHeapSize(uint64_t V) = 0; - /// Indicate whether the device should setup the device environment. Notice - /// that returning false in this function will change the behavior of the - /// setupDeviceEnvironment() function. - virtual bool shouldSetupDeviceEnvironment() const { return true; } - /// Indicate whether the device should setup the global device memory pool. 
If /// false is return the value on the device will be uninitialized. virtual bool shouldSetupDeviceMemoryPool() const { return true; } @@ -1235,7 +1234,7 @@ protected: enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; /// Array of peer access states with the rest of devices. This means that if - /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, /// the device I can access device J's memory directly. However, notice this /// does not mean that device J can access device I's memory directly. llvm::SmallVector<PeerAccessState> PeerAccesses; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index d4b5f914c667..36cdd6035e26 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -371,54 +371,6 @@ public: }; } // namespace llvm::omp::target::plugin -// Extract the mapping of host function pointers to device function pointers -// from the entry table. Functions marked as 'indirect' in OpenMP will have -// offloading entries generated for them which map the host's function pointer -// to a global containing the corresponding function pointer on the device. 
-static Expected<std::pair<void *, uint64_t>> -setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device, - DeviceImageTy &Image) { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - - llvm::ArrayRef<llvm::offloading::EntryTy> Entries( - Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd); - llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; - for (const auto &Entry : Entries) { - if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 || - !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) - continue; - - assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); - auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); - - GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size); - if (auto Err = - Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) - return std::move(Err); - - HstPtr = Entry.Address; - if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(), - Entry.Size, nullptr)) - return std::move(Err); - } - - // If we do not have any indirect globals we exit early. - if (IndirectCallTable.empty()) - return std::pair{nullptr, 0}; - - // Sort the array to allow for more efficient lookup of device pointers. 
- llvm::sort(IndirectCallTable, - [](const auto &x, const auto &y) { return x.first < y.first; }); - - uint64_t TableSize = - IndirectCallTable.size() * sizeof(std::pair<void *, void *>); - void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE); - if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(), - TableSize, nullptr)) - return std::move(Err); - return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); -} - AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr) : Device(Device), @@ -662,6 +614,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); } + // Return the number of teams required to cover the loop iterations. + if (isNoLoopMode()) + return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; + uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks(); uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max(); if (LoopTripCount > 0) { @@ -939,10 +895,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, // Add the image to list. LoadedImages.push_back(Image); - // Setup the device environment if needed. - if (auto Err = setupDeviceEnvironment(Plugin, *Image)) - return std::move(Err); - // Setup the global device memory pool if needed. if (!Plugin.getRecordReplay().isReplaying() && shouldSetupDeviceMemoryPool()) { @@ -978,43 +930,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return Image; } -Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, - DeviceImageTy &Image) { - // There are some plugins that do not need this step. - if (!shouldSetupDeviceEnvironment()) - return Plugin::success(); - - // Obtain a table mapping host function pointers to device function pointers. 
- auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image); - if (!CallTablePairOrErr) - return CallTablePairOrErr.takeError(); - - DeviceEnvironmentTy DeviceEnvironment; - DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind; - DeviceEnvironment.NumDevices = Plugin.getNumDevices(); - // TODO: The device ID used here is not the real device ID used by OpenMP. - DeviceEnvironment.DeviceNum = DeviceId; - DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; - DeviceEnvironment.ClockFrequency = getClockFrequency(); - DeviceEnvironment.IndirectCallTable = - reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); - DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; - DeviceEnvironment.HardwareParallelism = getHardwareParallelism(); - - // Create the metainfo of the device environment global. - GlobalTy DevEnvGlobal("__omp_rtl_device_environment", - sizeof(DeviceEnvironmentTy), &DeviceEnvironment); - - // Write device environment values to the device. - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) { - DP("Missing symbol %s, continue execution anyway.\n", - DevEnvGlobal.getName().data()); - consumeError(std::move(Err)); - } - return Plugin::success(); -} - Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize) { @@ -1337,16 +1252,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) { Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue) { + if (!AsyncInfo) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "invalid async info queue"); + SmallVector<void *> AllocsToDelete{}; { std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex}; - if (!AsyncInfo || !AsyncInfo->Queue) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid async info queue"); - - if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) - return Err; + // This 
can be false when no work has been added to the AsyncInfo. In which + // case, the device has nothing to synchronize. + if (AsyncInfo->Queue) + if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) + return Err; std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations); } @@ -2252,8 +2170,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, GenericGlobalHandlerTy &GHandler = getGlobalHandler(); if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) { - REPORT("Failure to look up global address: %s\n", - toString(std::move(Err)).data()); + consumeError(std::move(Err)); return OFFLOAD_FAIL; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index bf335ab20f75..af3c74636bff 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1060,8 +1060,10 @@ struct CUDADeviceTy : public GenericDeviceTy { Info.add("CUDA OpenMP Device Number", DeviceId); Res = cuDeviceGetName(TmpChar, 1000, Device); - if (Res == CUDA_SUCCESS) + if (Res == CUDA_SUCCESS) { Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); + } Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR); @@ -1118,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) MaxBlock.add("z", TmpInt); - auto &MaxGrid = *Info.add("Maximum Grid Dimensions", ""); + // TODO: I assume CUDA devices have no limit on the amount of threads, + // verify this + Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "", + DeviceInfo::MAX_WORK_SIZE); + + auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "", + DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); if (Res == CUDA_SUCCESS) MaxGrid.add("x", TmpInt); @@ -1444,7 +1452,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy 
&GenericDevice, Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); if (auto Err = Plugin::check( AttrResult, - "Error in cuLaunchKernel while setting the memory limits: %s")) + "error in cuFuncSetAttribute while setting the memory limits: %s")) return Err; MaxDynCGroupMemLimit = MaxDynCGroupMem; } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index f440ebaf17fe..5436cae3b029 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -387,7 +387,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// This plugin should not setup the device environment or memory pool. - virtual bool shouldSetupDeviceEnvironment() const override { return false; }; virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; /// Getters and setters for stack size and heap size not relevant. diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index f3e8e9a66685..c0290bfdab3f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -83,6 +83,7 @@ config.test_format = lit.formats.ShTest() config.test_flags = " -I " + config.test_source_root + \ " -I " + config.omp_header_directory + \ " -L " + config.library_dir + \ + " -L " + config.llvm_library_intdir + \ " -L " + config.llvm_lib_directory # compiler specific flags @@ -165,11 +166,12 @@ else: # Unices config.test_flags += " -nogpulib" config.test_flags += " -Wl,-rpath," + config.library_dir config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory + config.test_flags += " -Wl,-rpath," + config.llvm_library_intdir config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory if config.cuda_libdir: config.test_flags += " -Wl,-rpath," + config.cuda_libdir if config.libomptarget_current_target.startswith('nvptx'): - config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir + config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + 
config.llvm_library_intdir + "/nvptx64-nvidia-cuda" if config.libomptarget_current_target.endswith('-LTO'): config.test_flags += " -foffload-lto" if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env( diff --git a/offload/test/mapping/chained_containing_structs_1.cc b/offload/test/mapping/chained_containing_structs_1.cc new file mode 100644 index 000000000000..4dbb17140de1 --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_1.cc @@ -0,0 +1,58 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// XFAIL: * + +#include <cstdlib> +#include <cstdio> +#include <cassert> + +struct S { + int a; + int b; + int c; +}; + +struct T { + S *s0; + S *s1; + S *s2; +}; + +int main() { + T *v = (T *) malloc (sizeof(T)); + v->s0 = (S *) malloc (sizeof(S)); + v->s1 = (S *) malloc (sizeof(S)); + v->s2 = (S *) malloc (sizeof(S)); + v->s0->a = 10; + v->s0->b = 10; + v->s0->c = 10; + v->s1->a = 20; + v->s1->b = 20; + v->s1->c = 20; + v->s2->a = 30; + v->s2->b = 30; + v->s2->c = 30; + +#pragma omp target map(to: v[:1]) map(tofrom: v->s1->b, v->s1->c, v->s2->b) + { + v->s1->b += 3; + v->s1->c += 5; + v->s2->b += 7; + } + + printf ("%d\n", v->s0->a); // CHECK: 10 + printf ("%d\n", v->s0->b); // CHECK: 10 + printf ("%d\n", v->s0->c); // CHECK: 10 + printf ("%d\n", v->s1->a); // CHECK: 20 + printf ("%d\n", v->s1->b); // CHECK: 23 + printf ("%d\n", v->s1->c); // CHECK: 25 + printf ("%d\n", v->s2->a); // CHECK: 30 + printf ("%d\n", v->s2->b); // CHECK: 37 + printf ("%d\n", v->s2->c); // CHECK: 30 + + free(v->s0); + free(v->s1); + free(v->s2); + free(v); + + return 0; +} diff --git a/offload/test/mapping/chained_containing_structs_2.cc b/offload/test/mapping/chained_containing_structs_2.cc new file mode 100644 index 000000000000..29c4c8b7fedf --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_2.cc @@ -0,0 +1,76 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// XFAIL: * + +#include <cstdlib> +#include <cstdio> 
+#include <cassert> + +struct R { + int d; + int e; + int f; +}; + +struct S { + R *r0; + R *r1; + R *r2; +}; + +struct T { + S *s0; + S *s1; + S *s2; +}; + +int main() { + T *v = (T *) malloc (sizeof(T)); + + v->s0 = (S *) malloc (sizeof(S)); + v->s1 = (S *) malloc (sizeof(S)); + v->s2 = (S *) malloc (sizeof(S)); + + v->s0->r0 = (R *) calloc (1, sizeof(R)); + v->s0->r1 = (R *) calloc (1, sizeof(R)); + v->s0->r2 = (R *) calloc (1, sizeof(R)); + + v->s1->r0 = (R *) calloc (1, sizeof(R)); + v->s1->r1 = (R *) calloc (1, sizeof(R)); + v->s1->r2 = (R *) calloc (1, sizeof(R)); + + v->s2->r0 = (R *) calloc (1, sizeof(R)); + v->s2->r1 = (R *) calloc (1, sizeof(R)); + v->s2->r2 = (R *) calloc (1, sizeof(R)); + + #pragma omp target map(to: v->s1, v->s2, *v->s1, v->s1->r1, *v->s2, v->s2->r0) \ + map(tofrom: v->s1->r1->d, v->s1->r1->e, v->s1->r2->d, v->s1->r2->f, v->s2->r0->e) + { + v->s1->r1->d += 3; + v->s1->r1->e += 5; + v->s1->r2->d += 7; + v->s1->r2->f += 9; + v->s2->r0->e += 11; + } + + printf ("%d\n", v->s1->r1->d); // CHECK: 3 + printf ("%d\n", v->s1->r1->e); // CHECK: 5 + printf ("%d\n", v->s1->r2->d); // CHECK: 7 + printf ("%d\n", v->s1->r2->f); // CHECK: 9 + printf ("%d\n", v->s2->r0->e); // CHECK: 11 + + free(v->s0->r0); + free(v->s0->r1); + free(v->s0->r2); + free(v->s1->r0); + free(v->s1->r1); + free(v->s1->r2); + free(v->s2->r0); + free(v->s2->r1); + free(v->s2->r2); + free(v->s0); + free(v->s1); + free(v->s2); + free(v); + + return 0; +} diff --git a/offload/test/mapping/chained_containing_structs_3.cc b/offload/test/mapping/chained_containing_structs_3.cc new file mode 100644 index 000000000000..23555bf69110 --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_3.cc @@ -0,0 +1,217 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <cstdlib> +#include <cstdio> +#include <cassert> +#include <cstring> + +#include <omp.h> + +struct R { + int d; + int e; + int f; +}; + +struct S { + int a; + int b; + struct { + int c; 
+ R r; + R *rp; + } sub; + int g; +}; + +struct T { + int a; + int *ptr; + int b; +}; + +int main() { + R r; + R *rp = new R; + S s; + S *sp = new S; + T t; + T *tp = new T; + + memset(&r, 0, sizeof(R)); + memset(rp, 0, sizeof(R)); + memset(&s, 0, sizeof(S)); + memset(sp, 0, sizeof(S)); + memset(&t, 0, sizeof(T)); + memset(tp, 0, sizeof(T)); + + s.sub.rp = new R; + sp->sub.rp = new R; + + memset(s.sub.rp, 0, sizeof(R)); + memset(sp->sub.rp, 0, sizeof(R)); + + t.ptr = new int[10]; + tp->ptr = new int[10]; + + memset(t.ptr, 0, sizeof(int)*10); + memset(tp->ptr, 0, sizeof(int)*10); + +#pragma omp target map(tofrom: r) map(tofrom: r.e) +{ + r.d++; + r.e += 2; + r.f += 3; +} + printf ("%d\n", r.d); // CHECK: 1 + printf ("%d\n", r.e); // CHECK-NEXT: 2 + printf ("%d\n", r.f); // CHECK-NEXT: 3 + +#pragma omp target map(tofrom: rp[:1]) map(tofrom: rp->e) +{ + rp->d++; + rp->e += 2; + rp->f += 3; +} + + printf ("%d\n", rp->d); // CHECK-NEXT: 1 + printf ("%d\n", rp->e); // CHECK-NEXT: 2 + printf ("%d\n", rp->f); // CHECK-NEXT: 3 + + int v; + int *orig_addr_v = &v; + bool separate_memory_space; + +#pragma omp target data map(v) + { + void *mapped_ptr_v = + omp_get_mapped_ptr(orig_addr_v, omp_get_default_device()); + separate_memory_space = mapped_ptr_v != (void*) orig_addr_v; + } + + const char *mapping_flavour = separate_memory_space ? 
"separate" : "unified"; + +#pragma omp target map(to: s) map(tofrom: s.sub.r.e) +{ + s.b++; + s.sub.r.d+=2; + s.sub.r.e+=3; + s.sub.r.f+=4; +} + + printf ("%d/%s\n", s.b, mapping_flavour); + printf ("%d/%s\n", s.sub.r.d, mapping_flavour); + printf ("%d/%s\n", s.sub.r.e, mapping_flavour); + printf ("%d/%s\n", s.sub.r.f, mapping_flavour); + + // CHECK: {{0/separate|1/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: s, s.b) map(to: s.sub.rp[:1]) map(tofrom: s.sub.rp->e) +{ + s.b++; + s.sub.rp->d+=2; + s.sub.rp->e+=3; + s.sub.rp->f+=4; +} + + printf ("%d/%s\n", s.b, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->d, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->e, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: sp[:1]) map(tofrom: sp->sub.r.e) +{ + sp->b++; + sp->sub.r.d+=2; + sp->sub.r.e+=3; + sp->sub.r.f+=4; +} + + printf ("%d/%s\n", sp->b, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.d, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.e, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|1/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: sp[:1]) map(to: sp->sub.rp[:1]) map(tofrom: sp->sub.rp->e) +{ + sp->b++; + sp->sub.rp->d+=2; + sp->sub.rp->e+=3; + sp->sub.rp->f+=4; +} + + printf ("%d/%s\n", sp->b, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->d, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->e, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + 
+#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1]) +{ + t.a++; + t.ptr[2]+=2; + t.b+=3; +} + + printf ("%d\n", t.a); // CHECK-NEXT: 1 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 2 + printf ("%d\n", t.b); // CHECK-NEXT: 3 + +#pragma omp target map(tofrom: t) map(tofrom: t.a) +{ + t.b++; +} + + printf ("%d\n", t.b); // CHECK-NEXT: 4 + +#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a) +{ + t.a++; + t.ptr[2]+=2; + t.b+=3; +} + + printf ("%d\n", t.a); // CHECK-NEXT: 2 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4 + printf ("%d\n", t.b); // CHECK-NEXT: 7 + +#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a) +{ + /* Empty */ +} + + printf ("%d\n", t.a); // CHECK-NEXT: 2 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4 + printf ("%d\n", t.b); // CHECK-NEXT: 7 + + delete s.sub.rp; + delete sp->sub.rp; + + delete[] t.ptr; + delete[] tp->ptr; + + delete rp; + delete sp; + delete tp; + + return 0; +} diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c index c3b0dd2f49e6..869fb8ca9bc2 100644 --- a/offload/test/mapping/map_ptr_and_star_global.c +++ b/offload/test/mapping/map_ptr_and_star_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c index f0ca84d1cc4d..cc826b3c0290 100644 --- a/offload/test/mapping/map_ptr_and_star_local.c +++ b/offload/test/mapping/map_ptr_and_star_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c index a3a10b6c9b21..839db068aa90 100644 --- a/offload/test/mapping/map_ptr_and_subscript_global.c +++ b/offload/test/mapping/map_ptr_and_subscript_global.c @@ -1,5 +1,7 @@ // RUN: 
%libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c index bb44999541a7..68ac9dc0917f 100644 --- a/offload/test/mapping/map_ptr_and_subscript_local.c +++ b/offload/test/mapping/map_ptr_and_subscript_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c index 10e72e070dbc..960eea419964 100644 --- a/offload/test/mapping/map_structptr_and_member_global.c +++ b/offload/test/mapping/map_structptr_and_member_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c index 9e59551ad3d6..bd759407ef09 100644 --- a/offload/test/mapping/map_structptr_and_member_local.c +++ b/offload/test/mapping/map_structptr_and_member_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp new file mode 100644 index 000000000000..3b1a8192bf2c --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp @@ -0,0 +1,85 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. +// The corresponding data is mapped on a previous enter_data directive. 
+ +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. +// CHECK: A: 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. +// CHECK: B: 1 +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. 
+// CHECK: D: 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp new file mode 100644 index 000000000000..b9ebde431e7b --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp @@ -0,0 +1,143 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. 
+// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp new file mode 100644 index 000000000000..e9a1124bc461 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp @@ -0,0 +1,98 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. 
+// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. +// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp new file mode 100644 index 000000000000..0090cdb09536 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp @@ -0,0 +1,158 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp new file mode 100644 index 000000000000..883297f7e90c --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp @@ -0,0 +1,93 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. 
+// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. 
+// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp new file mode 100644 index 000000000000..79c6f69edba8 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp @@ -0,0 +1,159 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. 
+// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. 
+// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. +// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. 
+// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp new file mode 100644 index 000000000000..f018c65f36ec --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. 
+// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp new file mode 100644 index 000000000000..9360db419504 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp @@ -0,0 +1,166 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. 
+// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. 
+// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c index 5c2bb8a48f6e..4a9dbe252f76 100644 --- a/offload/test/mapping/target_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c @@ -12,7 +12,9 @@ int main() { printf("%d, %p\n", xp[1], &xp[1]); #pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x) #pragma omp target is_device_ptr(xp) - { xp[1] = 222; } + { + xp[1] = 222; + } // CHECK: 222 printf("%d, %p\n", xp[1], &xp[1]); } diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c index 7a5babd69253..28ec6857fa1a 100644 --- a/offload/test/mapping/target_wrong_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c @@ -14,7 +14,7 @@ int main() { // CHECK: host addr=0x[[#%x,HOST_ADDR:]] fprintf(stderr, "host addr=%p\n", x); -#pragma omp target data map(to : x [0:10]) +#pragma omp target data map(to : x[0 : 10]) { // CHECK: omptarget device 0 info: variable x does not have a valid device // counterpart @@ -27,4 +27,3 @@ int main() { return 0; } - diff --git a/offload/test/mapping/array_section_use_device_ptr.c 
b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c index 86e2875c35c4..4cfcce28c112 100644 --- a/offload/test/mapping/array_section_use_device_ptr.c +++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c @@ -20,7 +20,9 @@ int main() { float *A_dev = NULL; #pragma omp target data use_device_ptr(A) - { A_dev = A; } + { + A_dev = A; + } #pragma omp target exit data map(delete : A[FROM : LENGTH]) // CHECK: Success diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp new file mode 100644 index 000000000000..a7745de53298 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// CHECK: A: 1 +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// CHECK: B: 1 +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range. +// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not.
Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp new file mode 100644 index 000000000000..fe3cdb56e4ba --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp @@ -0,0 +1,125 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 1 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. 
+// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp new file mode 100644 index 000000000000..66e65de4195a --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp @@ -0,0 +1,111 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range.
+// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not. Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp new file mode 100644 index 000000000000..419ab3eb33d4 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp @@ -0,0 +1,136 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. 
+// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. 
Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/offloading/fortran/dtype-char-array-map-2.f90 b/offload/test/offloading/fortran/dtype-char-array-map-2.f90 new file mode 100644 index 000000000000..f17ea9e53853 --- /dev/null +++ b/offload/test/offloading/fortran/dtype-char-array-map-2.f90 @@ -0,0 +1,25 @@ +! Offloading test that verifies certain type of character string arrays +! map to and from device without problem. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + type char_t + CHARACTER(LEN=16), dimension(10,10) :: char_arr + end type char_t + type(char_t) :: dtype_char + +!$omp target enter data map(alloc:dtype_char%char_arr) + +!$omp target + dtype_char%char_arr(2,2) = 'c' +!$omp end target + +!$omp target update from(dtype_char%char_arr) + + + print *, dtype_char%char_arr(2,2) +end program + +!CHECK: c diff --git a/offload/test/offloading/fortran/dtype-char-array-map.f90 b/offload/test/offloading/fortran/dtype-char-array-map.f90 new file mode 100644 index 000000000000..6b72c9e95101 --- /dev/null +++ b/offload/test/offloading/fortran/dtype-char-array-map.f90 @@ -0,0 +1,27 @@ +! Offloading test that verifies certain type of character string arrays +! (in this case allocatable) map to and from device without problem. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + type char_t + CHARACTER(LEN=16), dimension(:,:), allocatable :: char_arr + end type char_t + type(char_t) :: dtype_char + + allocate(dtype_char%char_arr(10,10)) + +!$omp target enter data map(alloc:dtype_char%char_arr) + +!$omp target + dtype_char%char_arr(2,2) = 'c' +!$omp end target + +!$omp target update from(dtype_char%char_arr) + + + print *, dtype_char%char_arr(2,2) +end program + +!CHECK: c diff --git a/offload/test/offloading/mandatory_but_no_devices.c b/offload/test/offloading/mandatory_but_no_devices.c index ecdee72acad0..df8a5f3b9278 100644 --- a/offload/test/offloading/mandatory_but_no_devices.c +++ b/offload/test/offloading/mandatory_but_no_devices.c @@ -3,6 +3,47 @@ // device. This behavior is proposed for OpenMP 5.2 in OpenMP spec github // issue 2669. +// AMD Tests +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR=target +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target teams' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target data map(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target enter data map(to:X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target exit data map(from:X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// 
RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target update to(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target update from(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// Nvidia Tests // RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR=target // RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \ // RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \ @@ -42,8 +83,6 @@ // RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \ // RUN: %fcheck-nvptx64-nvidia-cuda -// REQUIRES: nvptx64-nvidia-cuda - #include <omp.h> #include <stdio.h> diff --git a/offload/test/offloading/memory_manager.cpp b/offload/test/offloading/memory_manager.cpp index fba1e4a54012..d6d8697fcdec 100644 --- a/offload/test/offloading/memory_manager.cpp +++ b/offload/test/offloading/memory_manager.cpp @@ -1,7 +1,5 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// REQUIRES: nvidiagpu - #include <omp.h> #include <cassert> diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c index 6f497309df2f..1ce8d4ac07f6 100644 --- a/offload/test/tools/llvm-omp-device-info.c +++ b/offload/test/tools/llvm-omp-device-info.c @@ -2,5 +2,5 @@ // // Just check any device was found and something is printed // -// CHECK: Found {{[1-9].*}} devices: -// CHECK: Device 0: +// CHECK: Num Devices: {{[1-9].*}} +// CHECK: [{{[1-9A-Za-z].*}}] diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td index e919492cc5bf..41949db7226a 100644 --- 
a/offload/test/tools/offload-tblgen/default_returns.td +++ b/offload/test/tools/offload-tblgen/default_returns.td @@ -6,13 +6,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td index c66d5b488b46..94ea820d453e 100644 --- a/offload/test/tools/offload-tblgen/entry_points.td +++ b/offload/test/tools/offload-tblgen/entry_points.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td index dec93577b57e..2802c78a2947 100644 --- a/offload/test/tools/offload-tblgen/functions_basic.td +++ b/offload/test/tools/offload-tblgen/functions_basic.td @@ -6,8 +6,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td index aec20129343f..8d7aa00c5f15 100644 --- a/offload/test/tools/offload-tblgen/functions_code_loc.td +++ b/offload/test/tools/offload-tblgen/functions_code_loc.td @@ -7,8 +7,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td 
b/offload/test/tools/offload-tblgen/functions_ranged_param.td index d0996b231973..1ce8b394b157 100644 --- a/offload/test/tools/offload-tblgen/functions_ranged_param.td +++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td @@ -8,13 +8,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def some_handle_t : Handle { let desc = "An example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/print_enum.td b/offload/test/tools/offload-tblgen/print_enum.td index 97f869689293..c7573a9a415c 100644 --- a/offload/test/tools/offload-tblgen/print_enum.td +++ b/offload/test/tools/offload-tblgen/print_enum.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : Enum { - let name = "my_enum_t"; +def my_enum_t : Enum { let desc = "An example enum"; let etors =[ Etor<"VALUE_ONE", "The first enum value">, diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td index ce1fe4c52760..74b39f145a40 100644 --- a/offload/test/tools/offload-tblgen/print_function.td +++ b/offload/test/tools/offload-tblgen/print_function.td @@ -5,13 +5,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td index 95964e32f0c9..b32531aac9c8 100644 --- a/offload/test/tools/offload-tblgen/type_tagged_enum.td +++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td @@ -9,13 +9,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def 
some_handle_t: Handle { let desc = "An example handle type"; } -def : Enum { - let name = "my_type_tagged_enum_t"; +def my_type_tagged_enum_t : Enum { let desc = "Example type tagged enum"; let is_typed = 1; let etors = [ @@ -34,8 +32,7 @@ def : Enum { // CHECK-API-NEXT: [some_handle_t] Value three. // CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2, -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt index 3787c12f940a..cc2d0a6add8b 100644 --- a/offload/tools/deviceinfo/CMakeLists.txt +++ b/offload/tools/deviceinfo/CMakeLists.txt @@ -4,10 +4,6 @@ add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp) llvm_update_compile_flags(llvm-offload-device-info) -target_include_directories(llvm-offload-device-info PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} -) target_link_libraries(llvm-offload-device-info PRIVATE - omp - omptarget + LLVMOffload ) diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 2228fbf3ec17..a2955d49d396 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -1,4 +1,4 @@ -//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===// +//===- llvm-offload-device-info.cpp - Print liboffload properties ---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,27 +6,270 @@ // //===----------------------------------------------------------------------===// // -// This is a command line utility that, by using LLVM/Offload, and the device -// plugins, list devices information as seen by the runtime. 
+// This is a command line utility that, by using the new liboffload API, prints +// all devices and properties // //===----------------------------------------------------------------------===// -#include "omptarget.h" -#include <cstdio> +#include <OffloadAPI.h> +#include <iostream> +#include <vector> -int main(int argc, char **argv) { - __tgt_bin_desc EmptyDesc = {0, nullptr, nullptr, nullptr}; - __tgt_register_lib(&EmptyDesc); - __tgt_init_all_rtls(); +#define OFFLOAD_ERR(X) \ + if (auto Err = X) { \ + return Err; \ + } + +enum class PrintKind { + NORMAL, + FP_FLAGS, +}; + +template <typename T, PrintKind PK = PrintKind::NORMAL> +void doWrite(std::ostream &S, T &&Val) { + S << Val; +} + +template <> +void doWrite<ol_platform_backend_t>(std::ostream &S, + ol_platform_backend_t &&Val) { + switch (Val) { + case OL_PLATFORM_BACKEND_UNKNOWN: + S << "UNKNOWN"; + break; + case OL_PLATFORM_BACKEND_CUDA: + S << "CUDA"; + break; + case OL_PLATFORM_BACKEND_AMDGPU: + S << "AMDGPU"; + break; + case OL_PLATFORM_BACKEND_HOST: + S << "HOST"; + break; + default: + S << "<< INVALID >>"; + break; + } +} +template <> +void doWrite<ol_device_type_t>(std::ostream &S, ol_device_type_t &&Val) { + switch (Val) { + case OL_DEVICE_TYPE_GPU: + S << "GPU"; + break; + case OL_DEVICE_TYPE_CPU: + S << "CPU"; + break; + case OL_DEVICE_TYPE_HOST: + S << "HOST"; + break; + default: + S << "<< INVALID >>"; + break; + } +} +template <> +void doWrite<ol_dimensions_t>(std::ostream &S, ol_dimensions_t &&Val) { + S << "{x: " << Val.x << ", y: " << Val.y << ", z: " << Val.z << "}"; +} +template <> +void doWrite<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + std::ostream &S, ol_device_fp_capability_flags_t &&Val) { + S << Val << " {"; + + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT) { + S << " CORRECTLY_ROUNDED_DIVIDE_SQRT"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST) { + S << " ROUND_TO_NEAREST"; + } + if (Val & 
OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO) {
+    S << " ROUND_TO_ZERO";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF) {
+    S << " ROUND_TO_INF";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN) {
+    S << " INF_NAN";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_DENORM) {
+    S << " DENORM";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_FMA) {
+    S << " FMA";
+  }
+  if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_SOFT_FLOAT) {
+    S << " SOFT_FLOAT";
+  }
+
+  S << " }";
+}
-  printf("Found %d devices:\n", omp_get_num_devices());
-  for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) {
-    printf(" Device %d:\n", Dev);
-    if (!__tgt_print_device_info(Dev))
-      printf(" print_device_info not implemented\n");
-    printf("\n");
+/// Write "<Desc>: <value>" for platform property \p Info of \p Plat to \p S.
+/// Pointer-typed \p T values are fetched into a byte buffer sized by
+/// olGetPlatformInfoSize; any other \p T is fetched directly into a \p T.
+/// Returns early with the liboffload error if a query fails.
+template <typename T>
+ol_result_t printPlatformValue(std::ostream &S, ol_platform_handle_t Plat,
+                               ol_platform_info_t Info, const char *Desc) {
+  S << Desc << ": ";
+
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size;
+    OFFLOAD_ERR(olGetPlatformInfoSize(Plat, Info, &Size));
+    Val.resize(Size);
+    // Pass the buffer's byte count: sizeof(Val) would be the size of the
+    // std::vector object itself, not of the storage it owns.
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, Size, Val.data()));
+    doWrite(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, sizeof(Val), &Val));
+    doWrite(S, std::move(Val));
+  }
+  S << "\n";
+  return OL_SUCCESS;
+}
+
+/// Write "<Desc>: <value>" for device property \p Info of \p Dev to \p S,
+/// formatted via doWrite<T, PK>. When \p Units is non-null it is appended
+/// after the value, separated by a space.
+/// Returns early with the liboffload error if a query fails.
+template <typename T, PrintKind PK = PrintKind::NORMAL>
+ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev,
+                             ol_device_info_t Info, const char *Desc,
+                             const char *Units = nullptr) {
+  S << Desc << ": ";
+
+  if constexpr (std::is_pointer_v<T>) {
+    std::vector<uint8_t> Val;
+    size_t Size;
+    OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size));
+    Val.resize(Size);
+    // Pass the buffer's byte count: sizeof(Val) would be the size of the
+    // std::vector object itself, not of the storage it owns.
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data()));
+    doWrite<T, PK>(S, reinterpret_cast<T>(Val.data()));
+  } else {
+    T Val;
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), &Val));
+    doWrite<T, PK>(S, std::move(Val));
+  }
+  if (Units)
+    S << " " << Units;
+  S <<
"\n"; + return OL_SUCCESS; +} + +ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { + ol_platform_handle_t Platform; + OFFLOAD_ERR( + olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform)); + + std::vector<char> Name; + size_t NameSize; + OFFLOAD_ERR(olGetDeviceInfoSize(D, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize)) + Name.resize(NameSize); + OFFLOAD_ERR( + olGetDeviceInfo(D, OL_DEVICE_INFO_PRODUCT_NAME, NameSize, Name.data())); + S << "[" << Name.data() << "]\n"; + + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_NAME, "Platform Name")); + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_VENDOR_NAME, "Platform Vendor Name")); + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_VERSION, "Platform Version")); + OFFLOAD_ERR(printPlatformValue<ol_platform_backend_t>( + S, Platform, OL_PLATFORM_INFO_BACKEND, "Platform Backend")); + + OFFLOAD_ERR( + printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); + OFFLOAD_ERR( + printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); + OFFLOAD_ERR(printDeviceValue<const char *>( + S, D, OL_DEVICE_INFO_DRIVER_VERSION, "Driver Version")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, "Max Work Group Size")); + OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, + "Max Work Group Size Per Dimension")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MAX_WORK_SIZE, + "Max Work Size")); + OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, + "Max Work Size Per Dimension")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_VENDOR_ID, "Vendor ID")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NUM_COMPUTE_UNITS, + "Num Compute Units")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, 
OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, "Max Clock Frequency", "MHz")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MEMORY_CLOCK_RATE, + "Memory Clock Rate", "MHz")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_ADDRESS_BITS, + "Address Bits")); + OFFLOAD_ERR(printDeviceValue<uint64_t>( + S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B")); + OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, + "Global Mem Size", "B")); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG, + "Single Precision Floating Point Capability"))); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_DOUBLE_FP_CONFIG, + "Double Precision Floating Point Capability"))); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_HALF_FP_CONFIG, + "Half Precision Floating Point Capability"))); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, + "Native Vector Width For Char")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, + "Native Vector Width For Short")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, + "Native Vector Width For Int")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, + "Native Vector Width For Long")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, + "Native Vector Width For Float")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, + "Native Vector Width For Double")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF, + "Native Vector Width For Half")); + + return OL_SUCCESS; +} + +ol_result_t 
printRoot(std::ostream &S) { + OFFLOAD_ERR(olInit()); + S << "Liboffload Version: " << OL_VERSION_MAJOR << "." << OL_VERSION_MINOR + << "." << OL_VERSION_PATCH << "\n"; + + std::vector<ol_device_handle_t> Devices; + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + reinterpret_cast<decltype(Devices) *>(UserData)->push_back(Device); + return true; + }, + &Devices)); + + S << "Num Devices: " << Devices.size() << "\n"; + + for (auto &D : Devices) { + S << "\n"; + OFFLOAD_ERR(printDevice(S, D)); } - __tgt_unregister_lib(&EmptyDesc); + OFFLOAD_ERR(olShutDown()); + return OL_SUCCESS; +} + +int main(int argc, char **argv) { + auto Err = printRoot(std::cout); + + if (Err) { + std::cerr << "[Liboffload error " << Err->Code << "]: " << Err->Details + << "\n"; + return 1; + } return 0; } diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md new file mode 100644 index 000000000000..0202242c99a0 --- /dev/null +++ b/offload/unittests/Conformance/README.md @@ -0,0 +1,83 @@ +# GPU Math Conformance Tests + +## Overview + +This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL. + +While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison. + +The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data. + +### Table of Contents + +- [Getting Started](#getting-started) +- [Running the Tests](#running-the-tests) +- [Adding New Tests](#adding-new-tests) + +## Getting Started + +This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets. 
+ +### System Requirements + +Before you begin, ensure your system meets the following requirements: + +- A system with an AMD or NVIDIA GPU. +- The latest proprietary GPU drivers installed. +- The corresponding development SDK for your hardware: + - **AMD:** [ROCm SDK](https://rocm.docs.amd.com) + - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) + +### Building the Dependencies + +The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. Please follow the instructions in the **"Standard runtimes build"** section of that guide: + +- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html) + +> [!IMPORTANT] +> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results. + +## Running the Tests + +### Default Test + +To build and run the conformance test for a given function (e.g., `logf`) against the default C standard math library `llvm-libm` provider, use the following command. This will execute the test on all available and supported platforms. + +```bash +ninja -C build/runtimes/runtimes-bins offload.conformance.logf +``` + +### Testing Other Providers + +Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag. 
+ +- **For `cuda-math` on an NVIDIA GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda + ``` + +- **For `hip-math` on an AMD GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu + ``` + +You can also run all available configurations for a test with: + +```bash +./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all +``` + +## Adding New Tests + +To add a conformance test for a new math function, follow these steps: + +1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`. + +2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy. + + - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage. + - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space. + +3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable. 
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp index a0068c3cb59c..6c3425f1e17c 100644 --- a/offload/unittests/Conformance/lib/DeviceContext.cpp +++ b/offload/unittests/Conformance/lib/DeviceContext.cpp @@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{}; [[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) { std::size_t PropSize = 0; - OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize)); + OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, + &PropSize)); if (PropSize == 0) return ""; std::string PropValue(PropSize, '\0'); - OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize, + OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize, PropValue.data())); PropValue.pop_back(); // Remove the null terminator diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp index ef092cd4187d..c9da6ef9be7c 100644 --- a/offload/unittests/OffloadAPI/common/Environment.cpp +++ b/offload/unittests/OffloadAPI/common/Environment.cpp @@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out, raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) { size_t Size; - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size); + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size); std::vector<char> Name(Size); - olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data()); + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()); Out << Name.data(); return Out; } @@ -129,6 +129,9 @@ const std::vector<TestEnvironment::Device> &TestEnvironment::getDevices() { } } + if (Devices.size() == 0) + errs() << "Warning: No devices found for OffloadAPI tests.\n"; + return Devices; } diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp index 
fe7198a9c283..c5a35faba7a2 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -89,6 +89,54 @@ template <typename Fn> inline void threadify(Fn body) {
   }
 }
 
+/// Enqueues a host task on a queue that blocks until it is manually released.
+///
+/// enqueue() launches a host function that parks in wait() and then creates
+/// CompleteEvent on the same queue; trigger() releases the task and syncs on
+/// that event.
+struct ManuallyTriggeredTask {
+  std::mutex M;
+  std::condition_variable CV;
+  bool Flag = false; // Written and read only while holding M.
+  ol_event_handle_t CompleteEvent;
+
+  /// Launch the blocking host function on \p Queue and create CompleteEvent.
+  /// Returns the olLaunchHostFunction error, else the olCreateEvent result.
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  /// Block until trigger() sets Flag, giving up after one second; a timeout
+  /// is reported through EXPECT_TRUE.
+  void wait() {
+    std::unique_lock<std::mutex> lk(M);
+    CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  /// Release the blocked host function, then wait for CompleteEvent.
+  ol_result_t trigger() {
+    {
+      // Publish the flag under M: an unlocked store races with the predicate
+      // check in wait() and lets the notification slip past the waiter.
+      std::lock_guard<std::mutex> Lock(M);
+      Flag = true;
+    }
+    CV.notify_one();
+
+    // M is released before syncing; wait() must reacquire it to return.
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
@@ -216,9 +264,13 @@ struct OffloadEventTest : OffloadQueueTest {
   ol_event_handle_t Event = nullptr;
 };
 
+// Devices might not be available for offload testing, so allow uninstantiated
+// tests (as the device list will be empty). This means that all tests requiring
+// a device will be silently skipped.
#define OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(FIXTURE) \ INSTANTIATE_TEST_SUITE_P( \ , FIXTURE, ::testing::ValuesIn(TestEnvironment::getDevices()), \ [](const ::testing::TestParamInfo<TestEnvironment::Device> &info) { \ return SanitizeString(info.param.Name); \ - }) + }); \ + GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FIXTURE) diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 212a5d6ddf22..8cb0b8065c33 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -86,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessProductName) { + size_t Size = 0; + ASSERT_SUCCESS( + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + +TEST_P(olGetDeviceInfoTest, HostProductName) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); @@ -122,6 +145,19 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) { ASSERT_GT(Value.z, 0u); } +OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_SIZE, 0); + +TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) { + ol_dimensions_t Value{0, 0, 0}; + ASSERT_SUCCESS(olGetDeviceInfo(Device, + OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, 
+ sizeof(Value), &Value)); + ASSERT_GT(Value.x, 0u); + ASSERT_GT(Value.y, 0u); + ASSERT_GT(Value.z, 0u); +} + OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID, 0); OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index a28089d918e0..c4a3c2d5e3c7 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -31,10 +31,13 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE); OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, OL_DEVICE_INFO_PLATFORM); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION); OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_SIZE); OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t, OL_DEVICE_INFO_NUM_COMPUTE_UNITS); @@ -76,6 +79,14 @@ TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { ASSERT_EQ(Size, sizeof(uint32_t) * 3); } +TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize( + Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &Size)); + ASSERT_EQ(Size, sizeof(ol_dimensions_t)); + ASSERT_EQ(Size, sizeof(uint32_t) * 3); +} + TEST_P(olGetDeviceInfoSizeTest, InvalidNullHandle) { size_t Size = 0; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp 
b/offload/unittests/OffloadAPI/memory/olMemFill.cpp index 1b0bafa20208..a84ed3d78ecc 100644 --- a/offload/unittests/OffloadAPI/memory/olMemFill.cpp +++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp @@ -10,75 +10,129 @@ #include <OffloadAPI.h> #include <gtest/gtest.h> -using olMemFillTest = OffloadQueueTest; +struct olMemFillTest : OffloadQueueTest { + template <typename PatternTy, PatternTy PatternVal, size_t Size, + bool Block = false> + void test_body() { + ManuallyTriggeredTask Manual; + + // Block/enqueue tests ensure that the test has been enqueued to a queue + // (rather than being done synchronously if the queue happens to be empty) + if constexpr (Block) { + ASSERT_SUCCESS(Manual.enqueue(Queue)); + } + + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + PatternTy Pattern = PatternVal; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + if constexpr (Block) { + ASSERT_SUCCESS(Manual.trigger()); + } + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternTy *AllocPtr = reinterpret_cast<PatternTy *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); + } +}; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest); -TEST_P(olMemFillTest, Success8) { - constexpr size_t Size = 1024; - void *Alloc; - ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); - - uint8_t Pattern = 0x42; - ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); - - olSyncQueue(Queue); +TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success8NotMultiple4) { + test_body<uint8_t, 0x42, 1023>(); +} +TEST_P(olMemFillTest, Success8Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) { + test_body<uint8_t, 0x42, 1023, true>(); +} - size_t N = Size / sizeof(Pattern); - for (size_t i = 0; i < N; i++) { - uint8_t *AllocPtr 
= reinterpret_cast<uint8_t *>(Alloc); - ASSERT_EQ(AllocPtr[i], Pattern); - } +TEST_P(olMemFillTest, Success16) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success16NotMultiple4) { + test_body<uint16_t, 0x4243, 1022>(); +} +TEST_P(olMemFillTest, Success16Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) { + test_body<uint16_t, 0x4243, 1022, true>(); +} - olMemFree(Alloc); +TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); } +TEST_P(olMemFillTest, Success32Enqueue) { + test_body<uint32_t, 0xDEADBEEF, 1024, true>(); } -TEST_P(olMemFillTest, Success16) { +TEST_P(olMemFillTest, SuccessLarge) { constexpr size_t Size = 1024; void *Alloc; ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); - uint16_t Pattern = 0x4242; + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); olSyncQueue(Queue); size_t N = Size / sizeof(Pattern); for (size_t i = 0; i < N; i++) { - uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc); - ASSERT_EQ(AllocPtr[i], Pattern); + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); } olMemFree(Alloc); } -TEST_P(olMemFillTest, Success32) { +TEST_P(olMemFillTest, SuccessLargeEnqueue) { constexpr size_t Size = 1024; void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); - uint32_t Pattern = 0xDEADBEEF; + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + Manual.trigger(); olSyncQueue(Queue); size_t N = Size / sizeof(Pattern); for (size_t i = 0; i < N; i++) { - uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc); - 
ASSERT_EQ(AllocPtr[i], Pattern); + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); } olMemFree(Alloc); } -TEST_P(olMemFillTest, SuccessLarge) { - constexpr size_t Size = 1024; +TEST_P(olMemFillTest, SuccessLargeByteAligned) { + constexpr size_t Size = 17 * 64; void *Alloc; ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); - struct PatternT { + struct __attribute__((packed)) PatternT { uint64_t A; uint64_t B; - } Pattern{UINT64_MAX, UINT64_MAX}; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); @@ -89,14 +143,18 @@ TEST_P(olMemFillTest, SuccessLarge) { PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); } olMemFree(Alloc); } -TEST_P(olMemFillTest, SuccessLargeByteAligned) { +TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) { constexpr size_t Size = 17 * 64; void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); struct __attribute__((packed)) PatternT { @@ -107,6 +165,7 @@ TEST_P(olMemFillTest, SuccessLargeByteAligned) { ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + Manual.trigger(); olSyncQueue(Queue); size_t N = Size / sizeof(Pattern); diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp index 0dc8527df532..aa9e372ede2c 100644 --- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp +++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp @@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) { Queue = nullptr; } +TEST_P(olDestroyQueueTest, SuccessDelayedResolution) { + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + 
ASSERT_SUCCESS(olDestroyQueue(Queue)); + Queue = nullptr; + + ASSERT_SUCCESS(Manual.trigger()); +} + TEST_P(olDestroyQueueTest, InvalidNullHandle) { ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr)); } |
