diff options
Diffstat (limited to 'offload/DeviceRTL')
29 files changed, 0 insertions, 5845 deletions
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h deleted file mode 100644 index dc4d029ed75f..000000000000 --- a/offload/DeviceRTL/include/Allocator.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_ALLOCATOR_H -#define OMPTARGET_ALLOCATOR_H - -#include "DeviceTypes.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace allocator { - -static uint64_t constexpr ALIGNMENT = 16; - -/// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); - -/// Allocate \p Size bytes. -[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * -alloc(uint64_t Size); - -/// Free the allocation pointed to by \p Ptr. -void free(void *Ptr); - -} // namespace allocator - -} // namespace ompx - -extern "C" { -void *malloc(size_t Size); -void free(void *Ptr); -} - -#endif diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h deleted file mode 100644 index 95408933dd86..000000000000 --- a/offload/DeviceRTL/include/Configuration.h +++ /dev/null @@ -1,68 +0,0 @@ -//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// API to query the global (constant) device environment. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_CONFIGURATION_H -#define OMPTARGET_CONFIGURATION_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -namespace ompx { -namespace config { - -/// Return the number of devices in the system, same number as returned on the -/// host by omp_get_num_devices. -uint32_t getNumDevices(); - -/// Return the device number in the system for omp_get_device_num. -uint32_t getDeviceNum(); - -/// Return the user chosen debug level. -uint32_t getDebugKind(); - -/// Return if teams oversubscription is assumed -uint32_t getAssumeTeamsOversubscription(); - -/// Return if threads oversubscription is assumed -uint32_t getAssumeThreadsOversubscription(); - -/// Return the amount of dynamic shared memory that was allocated at launch. -uint64_t getDynamicMemorySize(); - -/// Returns the cycles per second of the device's fixed frequency clock. -uint64_t getClockFrequency(); - -/// Returns the pointer to the beginning of the indirect call table. -void *getIndirectCallTablePtr(); - -/// Returns the size of the indirect call table. -uint64_t getIndirectCallTableSize(); - -/// Returns the size of the indirect call table. -uint64_t getHardwareParallelism(); - -/// Return if debugging is enabled for the given debug kind. -bool isDebugMode(DeviceDebugKind Level); - -/// Indicates if this kernel may require thread-specific states, or if it was -/// explicitly disabled by the user. -bool mayUseThreadStates(); - -/// Indicates if this kernel may require data environments for nested -/// parallelism, or if it was explicitly disabled by the user. -bool mayUseNestedParallelism(); - -} // namespace config -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h deleted file mode 100644 index 98d0fa498d95..000000000000 --- a/offload/DeviceRTL/include/Debug.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEBUG_H -#define OMPTARGET_DEVICERTL_DEBUG_H - -#include "Configuration.h" -#include "LibC.h" - -/// Assertion -/// -/// { -extern "C" { -void __assert_assume(bool condition); -void __assert_fail(const char *expr, const char *file, unsigned line, - const char *function); -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function); -} - -#define ASSERT(expr, msg) \ - { \ - if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr)) \ - __assert_fail_internal(#expr, msg, __FILE__, __LINE__, \ - __PRETTY_FUNCTION__); \ - else \ - __assert_assume(expr); \ - } -#define UNREACHABLE(msg) \ - printf(msg); \ - __builtin_trap(); \ - __builtin_unreachable(); - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h deleted file mode 100644 index 111143a5578f..000000000000 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ /dev/null @@ -1,172 +0,0 @@ -//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_TYPES_H -#define OMPTARGET_TYPES_H - -#include <gpuintrin.h> -#include <stddef.h> -#include <stdint.h> - -template <typename T> using Private = __gpu_private T; -template <typename T> using Constant = __gpu_constant T; -template <typename T> using Local = __gpu_local T; -template <typename T> using Global = __gpu_local T; - -enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -}; - -enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -}; - -enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. - * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s) & kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -}; - -struct TaskDescriptorTy; -using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); -struct TaskDescriptorTy { - void *Payload; - TaskFnTy TaskFn; -}; - -using LaneMaskTy = uint64_t; - -namespace lanes { -enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; -} // namespace lanes - -/// The ident structure that describes a source location. The struct is -/// identical to the one in the kmp.h file. We maintain the same data structure -/// for compatibility. -struct IdentTy { - int32_t reserved_1; /**< might be used in Fortran; see above */ - int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - int32_t reserved_2; /**< not really used in Fortran any more; see above */ - int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. */ -}; - -using __kmpc_impl_lanemask_t = LaneMaskTy; - -using ParallelRegionFnTy = void *; - -using CriticalNameTy = int32_t[8]; - -struct omp_lock_t { - void *Lock; -}; - -// see definition in openmp/runtime kmp.h -typedef enum omp_severity_t { - severity_warning = 1, - severity_fatal = 2 -} omp_severity_t; - -using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); -using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, - int16_t lane_offset, int16_t shortCircuit); -using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); - -/// Macros for allocating variables in different address spaces. -///{ - -// Follows the pattern in interface.h -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) -} omp_allocator_handle_t; - -#define __PRAGMA(STR) _Pragma(#STR) -#define OMP_PRAGMA(STR) __PRAGMA(omp STR) - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h deleted file mode 100644 index b92514ee9838..000000000000 --- a/offload/DeviceRTL/include/DeviceUtils.h +++ /dev/null @@ -1,96 +0,0 @@ -//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H -#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H - -#include "DeviceTypes.h" -#include "Shared/Utils.h" - -namespace utils { - -template <typename T> struct type_identity { - using type = T; -}; - -template <typename T, T v> struct integral_constant { - inline static constexpr T value = v; -}; - -/// Freestanding SFINAE helpers. -template <class T> struct remove_cv : type_identity<T> {}; -template <class T> struct remove_cv<const T> : type_identity<T> {}; -template <class T> struct remove_cv<volatile T> : type_identity<T> {}; -template <class T> struct remove_cv<const volatile T> : type_identity<T> {}; -template <class T> using remove_cv_t = typename remove_cv<T>::type; - -using true_type = integral_constant<bool, true>; -using false_type = integral_constant<bool, false>; - -template <typename T, typename U> struct is_same : false_type {}; -template <typename T> struct is_same<T, T> : true_type {}; -template <typename T, typename U> -inline constexpr bool is_same_v = is_same<T, U>::value; - -template <typename T> struct is_floating_point { - inline static constexpr bool value = - is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>; -}; -template <typename T> -inline constexpr bool is_floating_point_v = is_floating_point<T>::value; - -template <bool B, typename T = void> struct enable_if; -template <typename T> struct enable_if<true, T> : type_identity<T> {}; -template <bool B, typename T = void> -using enable_if_t = typename enable_if<B, T>::type; - -template <class T> struct remove_addrspace : type_identity<T> {}; -template <class T, int N> -struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> {}; -template <class T> -using remove_addrspace_t = typename remove_addrspace<T>::type; - -template <typename To, typename From> inline To bitCast(From V) { - static_assert(sizeof(To) == sizeof(From), "Bad conversion"); - return __builtin_bit_cast(To, V); -} - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). -bool isThreadLocalMemPtr(void *Ptr); - -/// A pointer variable that has by design an `undef` value. Use with care. -[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils - -#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h deleted file mode 100644 index c4bfaaa2404b..000000000000 --- a/offload/DeviceRTL/include/Interface.h +++ /dev/null @@ -1,366 +0,0 @@ -//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_INTERFACE_H -#define OMPTARGET_DEVICERTL_INTERFACE_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -/// External API -/// -///{ - -extern "C" { - -/// ICV: dyn-var, constant 0 -/// -/// setter: ignored. -/// getter: returns 0. -/// -///{ -void omp_set_dynamic(int); -int omp_get_dynamic(void); -///} - -/// ICV: nthreads-var, integer -/// -/// scope: data environment -/// -/// setter: ignored. -/// getter: returns false. -/// -/// implementation notes: -/// -/// -///{ -void omp_set_num_threads(int); -int omp_get_max_threads(void); -///} - -/// ICV: thread-limit-var, computed -/// -/// getter: returns thread limited defined during launch. -/// -///{ -int omp_get_thread_limit(void); -///} - -/// ICV: max-active-level-var, constant 1 -/// -/// setter: ignored. -/// getter: returns 1. -/// -///{ -void omp_set_max_active_levels(int); -int omp_get_max_active_levels(void); -///} - -/// ICV: places-partition-var -/// -/// -///{ -///} - -/// ICV: active-level-var, 0 or 1 -/// -/// getter: returns 0 or 1. -/// -///{ -int omp_get_active_level(void); -///} - -/// ICV: level-var -/// -/// getter: returns parallel region nesting -/// -///{ -int omp_get_level(void); -///} - -/// ICV: run-sched-var -/// -/// -///{ -void omp_set_schedule(omp_sched_t, int); -void omp_get_schedule(omp_sched_t *, int *); -///} - -/// TODO this is incomplete. -int omp_get_num_threads(void); -int omp_get_thread_num(void); -void omp_set_nested(int); - -int omp_get_nested(void); - -void omp_set_max_active_levels(int Level); - -int omp_get_max_active_levels(void); - -omp_proc_bind_t omp_get_proc_bind(void); - -int omp_get_num_places(void); - -int omp_get_place_num_procs(int place_num); - -void omp_get_place_proc_ids(int place_num, int *ids); - -int omp_get_place_num(void); - -int omp_get_partition_num_places(void); - -void omp_get_partition_place_nums(int *place_nums); - -int omp_get_cancellation(void); - -void omp_set_default_device(int deviceId); - -int omp_get_default_device(void); - -int omp_get_num_devices(void); - -int omp_get_device_num(void); - -int omp_get_num_teams(void); - -int omp_get_team_num(); - -int omp_get_initial_device(void); - -void *llvm_omp_target_dynamic_shared_alloc(); - -/// Synchronization -/// -///{ -void omp_init_lock(omp_lock_t *Lock); - -void omp_destroy_lock(omp_lock_t *Lock); - -void omp_set_lock(omp_lock_t *Lock); - -void omp_unset_lock(omp_lock_t *Lock); - -int omp_test_lock(omp_lock_t *Lock); -///} - -/// Tasking -/// -///{ -int omp_in_final(void); - -int omp_get_max_task_priority(void); -///} - -/// Misc -/// -///{ -double omp_get_wtick(void); - -double omp_get_wtime(void); -///} -} - -extern "C" { -/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be -/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be -/// called by any thread, allocation happens *per thread*. -void *__kmpc_alloc_shared(uint64_t Bytes); - -/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like -/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the -/// allocated by __kmpc_alloc_shared by the same thread. -void __kmpc_free_shared(void *Ptr, uint64_t Bytes); - -/// Get a pointer to the memory buffer containing dynamically allocated shared -/// memory configured at launch. -void *__kmpc_get_dynamic_shared(); - -/// Allocate sufficient space for \p NumArgs sequential `void*` and store the -/// allocation address in \p GlobalArgs. -/// -/// Called by the main thread prior to a parallel region. -/// -/// We also remember it in GlobalArgsPtr to ensure the worker threads and -/// deallocation function know the allocation address too. -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); - -/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. -/// -/// Called by the main thread after a parallel region. -void __kmpc_end_sharing_variables(); - -/// Store the allocation address obtained via __kmpc_begin_sharing_variables in -/// \p GlobalArgs. -/// -/// Called by the worker threads in the parallel region (function). -void __kmpc_get_shared_variables(void ***GlobalArgs); - -/// External interface to get the thread ID. -uint32_t __kmpc_get_hardware_thread_id_in_block(); - -/// External interface to get the number of threads. -uint32_t __kmpc_get_hardware_num_threads_in_block(); - -/// External interface to get the warp size. -uint32_t __kmpc_get_warp_size(); - -/// Kernel -/// -///{ -// Forward declaration -struct KernelEnvironmentTy; - -int8_t __kmpc_is_spmd_exec_mode(); - -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -void __kmpc_target_deinit(); - -///} - -/// Reduction -/// -///{ -void *__kmpc_reduction_get_fixed_buffer(); - -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct); - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct); -///} - -/// Synchronization -/// -///{ -void __kmpc_ordered(IdentTy *Loc, int32_t TId); - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId); - -void __kmpc_end_master(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter); - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId); - -void __kmpc_end_single(IdentTy *Loc, int32_t TId); - -void __kmpc_flush(IdentTy *Loc); - -uint64_t __kmpc_warp_active_thread_mask(void); - -void __kmpc_syncwarp(uint64_t Mask); - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); -///} - -/// Parallelism -/// -///{ -/// TODO -void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); - -/// TODO -bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); - -/// TODO -void __kmpc_kernel_end_parallel(); - -/// TODO -void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); - -/// TODO -void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, - int32_t ThreadLimit); - -/// TODO -uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); - -///} - -/// Tasking -/// -///{ -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn); - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *); - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *); - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *); -///} - -/// Misc -/// -///{ -int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); - -int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); -///} - -/// Shuffle -/// -///{ -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -///} -} - -#endif diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h deleted file mode 100644 index 94b5e6519606..000000000000 --- a/offload/DeviceRTL/include/LibC.h +++ /dev/null @@ -1,23 +0,0 @@ -//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_LIBC_H -#define OMPTARGET_LIBC_H - -#include "DeviceTypes.h" - -namespace ompx { - -int printf(const char *Format, ...); - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h deleted file mode 100644 index 8ba018b5314a..000000000000 --- a/offload/DeviceRTL/include/Mapping.h +++ /dev/null @@ -1,108 +0,0 @@ -//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_MAPPING_H -#define OMPTARGET_MAPPING_H - -#include "DeviceTypes.h" - -namespace ompx { - -namespace mapping { - -enum { - DIM_X = __GPU_X_DIM, - DIM_Y = __GPU_Y_DIM, - DIM_Z = __GPU_Z_DIM, -}; - -inline constexpr uint32_t MaxThreadsPerTeam = 1024; - -/// Initialize the mapping machinery. -void init(bool IsSPMD); - -/// Return true if the kernel is executed in SPMD mode. -bool isSPMDMode(); - -/// Return true if the kernel is executed in generic mode. -bool isGenericMode(); - -/// Return true if the executing thread is the main thread in generic mode. -/// These functions will lookup state and it is required that that is OK for the -/// thread and location. See also `isInitialThreadInLevel0` for a stateless -/// alternative for certain situations, e.g. during initialization. -bool isMainThreadInGenericMode(); -bool isMainThreadInGenericMode(bool IsSPMD); - -/// Return true if this thread is the initial thread in parallel level 0. -/// -/// The thread for which this returns true should be used for single threaded -/// initialization tasks. We pick a special thread to ensure there are no -/// races between the initialization and the first read of initialized state. -bool isInitialThreadInLevel0(bool IsSPMD); - -/// Return true if the executing thread has the lowest Id of the active threads -/// in the warp. -bool isLeaderInWarp(); - -/// Return a mask describing all active threads in the warp. -LaneMaskTy activemask(); - -/// Return a mask describing all threads with a smaller Id in the warp. -LaneMaskTy lanemaskLT(); - -/// Return a mask describing all threads with a larger Id in the warp. -LaneMaskTy lanemaskGT(); - -/// Return the thread Id in the warp, in [0, getWarpSize()). -uint32_t getThreadIdInWarp(); - -/// Return the warp size, thus number of threads in the warp. -uint32_t getWarpSize(); - -/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()] -uint32_t getWarpIdInBlock(); - -/// Return the number of warps in the block. -uint32_t getNumberOfWarpsInBlock(); - -/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)). -uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); - -/// Return the block size, thus number of threads in the block. -uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); - -/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)). -uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); - -/// Return the number of blocks in the kernel. -uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); - -/// Return the kernel size, thus number of threads in the kernel. -uint32_t getNumberOfThreadsInKernel(); - -/// Return the maximal number of threads in the block usable for a team (= -/// parallel region). -/// -/// Note: The version taking \p IsSPMD mode explicitly can be used during the -/// initialization of the target region, that is before `mapping::isSPMDMode()` -/// can be called by any thread other than the main one. -uint32_t getMaxTeamThreads(); -uint32_t getMaxTeamThreads(bool IsSPMD); - -/// Return the number of processing elements on the device. -uint32_t getNumberOfProcessorElements(); - -} // namespace mapping - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h deleted file mode 100644 index d99475225412..000000000000 --- a/offload/DeviceRTL/include/Profiling.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_PROFILING_H -#define OMPTARGET_DEVICERTL_PROFILING_H - -extern "C" { -void __llvm_profile_register_function(void *Ptr); -void __llvm_profile_register_names_function(void *Ptr, long int I); -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); -} - -#endif diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h deleted file mode 100644 index db396dae6e44..000000000000 --- a/offload/DeviceRTL/include/State.h +++ /dev/null @@ -1,377 +0,0 @@ -//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_STATE_H -#define OMPTARGET_STATE_H - -#include "Shared/Environment.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace memory { - -/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. -void *allocShared(uint64_t Size, const char *Reason); - -/// Free \p Ptr, allocated via allocShared, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_free_shared for proper usage. -void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); - -/// Alloca \p Size bytes in global memory, if possible, for \p Reason. -void *allocGlobal(uint64_t Size, const char *Reason); - -/// Return a pointer to the dynamic shared memory buffer. -void *getDynamicBuffer(); - -/// Free \p Ptr, allocated via allocGlobal, for \p Reason. -void freeGlobal(void *Ptr, const char *Reason); - -} // namespace memory - -namespace state { - -inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; - -struct ICVStateTy { - uint32_t NThreadsVar; - uint32_t LevelVar; - uint32_t ActiveLevelVar; - uint32_t Padding0Val; - uint32_t MaxActiveLevelsVar; - uint32_t RunSchedVar; - uint32_t RunSchedChunkVar; - - bool operator==(const ICVStateTy &Other) const; - - void assertEqual(const ICVStateTy &Other) const; -}; - -struct TeamStateTy { - void init(bool IsSPMD); - - bool operator==(const TeamStateTy &) const; - - void assertEqual(TeamStateTy &Other) const; - - /// ICVs - /// - /// Preallocated storage for ICV values that are used if the threads have not - /// set a custom default. The latter is supported but unlikely and slow(er). - /// - ///{ - ICVStateTy ICVState; - ///} - - uint32_t ParallelTeamSize; - uint32_t HasThreadState; - ParallelRegionFnTy ParallelRegionFnVar; -}; - -extern Local<TeamStateTy> TeamState; - -struct ThreadStateTy { - - /// ICVs have preallocated storage in the TeamStateTy which is used if a - /// thread has not set a custom value. The latter is supported but unlikely. - /// When it happens we will allocate dynamic memory to hold the values of all - /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an - /// ICV struct to hold them all. This is slower than alternatives but allows - /// users to pay only for what they use. - /// - state::ICVStateTy ICVState; - - ThreadStateTy *PreviousThreadState; - - void init() { - ICVState = TeamState.ICVState; - PreviousThreadState = nullptr; - } - - void init(ThreadStateTy *PreviousTS) { - ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState; - PreviousThreadState = PreviousTS; - } -}; - -extern Local<ThreadStateTy **> ThreadStates; - -/// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -/// Return the kernel and kernel launch environment associated with the current -/// kernel. The former is static and contains compile time information that -/// holds for all instances of the kernel. The latter is dynamic and provides -/// per-launch information. -KernelEnvironmentTy &getKernelEnvironment(); -KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); - -/// TODO -enum ValueKind { - VK_NThreads, - VK_Level, - VK_ActiveLevel, - VK_MaxActiveLevels, - VK_RunSched, - // --- - VK_RunSchedChunk, - VK_ParallelRegionFn, - VK_ParallelTeamSize, - VK_HasThreadState, -}; - -/// TODO -void enterDataEnvironment(IdentTy *Ident); - -/// TODO -void exitDataEnvironment(); - -/// TODO -struct DateEnvironmentRAII { - DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } - ~DateEnvironmentRAII() { exitDataEnvironment(); } -}; - -/// TODO -void resetStateForThread(uint32_t TId); - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. -#define lookupForModify32Impl(Member, Ident, ForceTeamState) \ - { \ - if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() || \ - !TeamState.HasThreadState)) \ - return TeamState.ICVState.Member; \ - uint32_t TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ThreadStates[TId])) { \ - ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>( \ - memory::allocGlobal(sizeof(ThreadStateTy), \ - "ICV modification outside data environment")); \ - ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!"); \ - TeamState.HasThreadState = true; \ - ThreadStates[TId]->init(); \ - } \ - return ThreadStates[TId]->ICVState.Member; \ - } - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. -#define lookupImpl(Member, ForceTeamState) \ - { \ - auto TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && \ - TeamState.HasThreadState && ThreadStates[TId])) \ - return ThreadStates[TId]->ICVState.Member; \ - return TeamState.ICVState.Member; \ - } - -[[gnu::always_inline, gnu::flatten]] inline uint32_t & -lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - switch (Kind) { - case state::VK_NThreads: - if (IsReadonly) - lookupImpl(NThreadsVar, ForceTeamState); - lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState); - case state::VK_Level: - if (IsReadonly) - lookupImpl(LevelVar, ForceTeamState); - lookupForModify32Impl(LevelVar, Ident, ForceTeamState); - case state::VK_ActiveLevel: - if (IsReadonly) - lookupImpl(ActiveLevelVar, ForceTeamState); - lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState); - case state::VK_MaxActiveLevels: - if (IsReadonly) - lookupImpl(MaxActiveLevelsVar, ForceTeamState); - lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState); - case state::VK_RunSched: - if (IsReadonly) - lookupImpl(RunSchedVar, ForceTeamState); - lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState); - case state::VK_RunSchedChunk: - if (IsReadonly) - lookupImpl(RunSchedChunkVar, ForceTeamState); - lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState); - case state::VK_ParallelTeamSize: - return TeamState.ParallelTeamSize; - case state::VK_HasThreadState: - return TeamState.HasThreadState; - default: - break; - } - __builtin_unreachable(); -} - -[[gnu::always_inline, gnu::flatten]] inline void *& -lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { - switch (Kind) { - case state::VK_ParallelRegionFn: - return TeamState.ParallelRegionFnVar; - default: - break; - } - __builtin_unreachable(); -} - -/// A class without actual state used to provide a nice interface to lookup and -/// update ICV values we can declare in global scope. -template <typename Ty, ValueKind Kind> struct Value { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) { - set(Other, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator++() { - inc(1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator--() { - inc(-1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] void - assert_eq(const Ty &V, IdentTy *Ident = nullptr, - bool ForceTeamState = false) { - ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr); - } - -private: - [[gnu::flatten, gnu::always_inline]] Ty & - lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState); - return t; - } - - [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) += - UpdateVal); - } - - [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) = - UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -/// A mookup class without actual state used to provide -/// a nice interface to lookup and update ICV values -/// we can declare in global scope. -template <typename Ty, ValueKind Kind> struct PtrValue { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) { - set(Other); - return *this; - } - -private: - Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) { - return lookupPtr(Kind, IsReadonly, ForceTeamState); - } - - Ty &set(Ty UpdateVal) { - return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false) = UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -template <typename VTy, typename Ty> struct ValueRAII { - ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident, - bool ForceTeamState = false) - : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState) - : (Ty *)utils::UndefPtr), - Val(OldValue), Active(Active) { - if (!Active) - return; - ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!"); - *Ptr = NewValue; - } - ~ValueRAII() { - if (Active) - *Ptr = Val; - } - -private: - Ty *Ptr; - Ty Val; - bool Active; -}; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk; - -/// TODO -inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize; - -/// TODO -inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState; - -/// TODO -inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn> - ParallelRegionFn; - -void runAndCheckState(void(Func(void))); - -void assumeInitialState(bool IsSPMD); - -/// Return the value of the ParallelTeamSize ICV. -int getEffectivePTeamSize(); - -} // namespace state - -namespace icv { - -/// TODO -inline state::Value<uint32_t, state::VK_NThreads> NThreads; - -/// TODO -inline state::Value<uint32_t, state::VK_Level> Level; - -/// The `active-level` describes which of the parallel level counted with the -/// `level-var` is active. There can only be one. -/// -/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0. -inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel; - -/// TODO -inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSched> RunSched; - -} // namespace icv - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h deleted file mode 100644 index 7e7c8eacb917..000000000000 --- a/offload/DeviceRTL/include/Synchronization.h +++ /dev/null @@ -1,225 +0,0 @@ -//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H - -#include "DeviceTypes.h" -#include "DeviceUtils.h" - -namespace ompx { -namespace atomic { - -enum OrderingTy { - relaxed = __ATOMIC_RELAXED, - acquire = __ATOMIC_ACQUIRE, - release = __ATOMIC_RELEASE, - acq_rel = __ATOMIC_ACQ_REL, - seq_cst = __ATOMIC_SEQ_CST, -}; - -enum MemScopeTy { - system = __MEMORY_SCOPE_SYSTEM, - device = __MEMORY_SCOPE_DEVICE, - workgroup = __MEMORY_SCOPE_WRKGRP, - wavefront = __MEMORY_SCOPE_WVFRNT, - single = __MEMORY_SCOPE_SINGLE, -}; - -/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. -uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device); - -/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The -/// result is stored in \p *Addr; -/// { - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, - atomic::OrderingTy OrderingFail, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, - OrderingSucc, OrderingFail, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V add(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V load(Ty *Address, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { -#ifdef __NVPTX__ - return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope); -#else - return __scoped_atomic_load_n(Address, Ordering, MemScope); -#endif -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -void store(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - __scoped_atomic_store_n(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V mul(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - Ty TypedCurrentVal, TypedResultVal, TypedNewVal; - bool Success; - do { - TypedCurrentVal = atomic::load(Address, Ordering); - TypedNewVal = TypedCurrentVal * Val; - Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, - atomic::relaxed, MemScope); - } while (!Success); - return TypedResultVal; -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(max( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(min( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(max( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(min( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(min( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(max( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(min( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(max( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope); -} - -static inline uint32_t -atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - uint32_t R; - __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope); - return R; -} - -///} - -} // namespace atomic - -namespace synchronize { - -/// Initialize the synchronization machinery. Must be called by all threads. -void init(bool IsSPMD); - -/// Synchronize all threads in a warp identified by \p Mask. -void warp(LaneMaskTy Mask); - -/// Synchronize all threads in a block and perform a fence before and after the -/// barrier according to \p Ordering. Note that the fence might be part of the -/// barrier. -void threads(atomic::OrderingTy Ordering); - -/// Synchronizing threads is allowed even if they all hit different instances of -/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more -/// restrictive in that it requires all threads to hit the same instance. The -/// noinline is removed by the openmp-opt pass and helps to preserve the -/// information till then. -///{ - -/// Synchronize all threads in a block, they are reaching the same instruction -/// (hence all threads in the block are "aligned"). Also perform a fence before -/// and after the barrier according to \p Ordering. Note that the -/// fence might be part of the barrier if the target offers this. -[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void -threadsAligned(atomic::OrderingTy Ordering); - -///} - -} // namespace synchronize - -namespace fence { - -/// Memory fence with \p Ordering semantics for the team. -void team(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the contention group. -void kernel(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the system. -void system(atomic::OrderingTy Ordering); - -} // namespace fence - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h deleted file mode 100644 index 554c3271c334..000000000000 --- a/offload/DeviceRTL/include/Workshare.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-------- Workshare.h - OpenMP Workshare interface ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_WORKSHARE_H -#define OMPTARGET_WORKSHARE_H - -namespace ompx { - -namespace workshare { - -/// Initialize the worksharing machinery. -void init(bool IsSPMD); - -} // namespace workshare - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/offload/DeviceRTL/include/generated_microtask_cases.gen deleted file mode 100644 index a05f6da2f84f..000000000000 --- a/offload/DeviceRTL/include/generated_microtask_cases.gen +++ /dev/null @@ -1,797 +0,0 @@ -case 0: -((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid); -break; -case 1: -((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]); -break; -case 2: -((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid, - args[0], args[1]); -break; -case 3: -((void (*)(int32_t *, int32_t *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]); -break; -case 4: -((void (*)(int32_t *, int32_t *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3]); -break; -case 5: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4]); -break; -case 6: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5]); -break; -case 7: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6]); -break; -case 8: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7]); -break; -case 9: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8]); -break; -case 10: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], - args[4], args[5], args[6], - args[7], args[8], args[9]); -break; -case 11: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10]); -break; -case 12: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11]); -break; -case 13: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12]); -break; -case 14: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13]); -break; -case 15: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14]); -break; -case 16: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15]); -break; -case 17: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16]); -break; -case 18: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17]); -break; -case 19: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18]); -break; -case 20: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19]); -break; -case 21: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20]); -break; -case 22: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21]); -break; -case 23: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22]); -break; -case 24: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23]); -break; -case 25: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], - args[21], args[22], args[23], args[24]); -break; -case 26: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25]); -break; -case 27: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26]); -break; -case 28: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27]); -break; -case 29: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28]); -break; -case 30: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29]); -break; -case 31: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30]); -break; -case 32: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], - args[31]); -break; -case 33: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32]); -break; -case 34: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33]); -break; -case 35: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34]); -break; -case 36: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35]); -break; -case 37: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36]); -break; -case 38: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37]); -break; -case 39: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38]); -break; -case 40: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39]); -break; -case 41: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40]); -break; -case 42: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41]); -break; -case 43: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42]); -break; -case 44: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43]); -break; -case 45: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44]); -break; -case 46: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45]); -break; -/// DONE TO HERE -case 47: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46]); -break; -case 48: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47]); -break; -case 49: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48]); -break; -case 50: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49]); -break; -case 51: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50]); -break; -case 52: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51]); -break; -case 53: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52]); -break; -case 54: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53]); -break; -case 55: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54]); -break; -case 56: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39], args[40], args[41], - args[42], args[43], args[44], args[45], args[46], - args[47], args[48], args[49], args[50], args[51], - args[52], args[53], args[54], args[55]); -break; -case 57: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56]); -break; -case 58: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57]); -break; -case 59: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58]); -break; -case 60: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59]); -break; -case 61: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60]); -break; -case 62: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61]); -break; -case 63: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46], args[47], args[48], - args[49], args[50], args[51], args[52], args[53], - args[54], args[55], args[56], args[57], args[58], - args[59], args[60], args[61], args[62]); -break; -case 64: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61], - args[62], args[63]); -break; diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp deleted file mode 100644 index aac2a6005158..000000000000 --- a/offload/DeviceRTL/src/Allocator.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" -#include "Synchronization.h" - -using namespace ompx; - -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; - -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. -struct BumpAllocatorTy final { - - void *alloc(uint64_t Size) { - Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) - __builtin_trap(); - - return reinterpret_cast<void *>(OldData); - } - - void free(void *) {} -}; - -BumpAllocatorTy BumpAllocator; - -/// allocator namespace implementation -/// -///{ - -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. -} - -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } - -///} diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp deleted file mode 100644 index 0c31c66ab2de..000000000000 --- a/offload/DeviceRTL/src/Configuration.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data object of the constant device environment and the -// query API. -// -//===----------------------------------------------------------------------===// - -#include "Configuration.h" -#include "DeviceTypes.h" -#include "State.h" - -using namespace ompx; - -// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled. -[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription = - 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0; - -// This variable should be visible to the plugin so we override the default -// hidden visibility. -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment; - -uint32_t config::getAssumeTeamsOversubscription() { - return __omp_rtl_assume_teams_oversubscription; -} - -uint32_t config::getAssumeThreadsOversubscription() { - return __omp_rtl_assume_threads_oversubscription; -} - -uint32_t config::getDebugKind() { - return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind; -} - -uint32_t config::getNumDevices() { - return __omp_rtl_device_environment.NumDevices; -} - -uint32_t config::getDeviceNum() { - return __omp_rtl_device_environment.DeviceNum; -} - -uint64_t config::getDynamicMemorySize() { - return __omp_rtl_device_environment.DynamicMemSize; -} - -uint64_t config::getClockFrequency() { - return __omp_rtl_device_environment.ClockFrequency; -} - -void *config::getIndirectCallTablePtr() { - return reinterpret_cast<void *>( - __omp_rtl_device_environment.IndirectCallTable); -} - -uint64_t config::getHardwareParallelism() { - return __omp_rtl_device_environment.HardwareParallelism; -} - -uint64_t config::getIndirectCallTableSize() { - return __omp_rtl_device_environment.IndirectCallTableSize; -} - -bool config::isDebugMode(DeviceDebugKind Kind) { - return config::getDebugKind() & uint32_t(Kind); -} - -bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } - -bool config::mayUseNestedParallelism() { - if (__omp_rtl_assume_no_nested_parallelism) - return false; - return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; -} diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp deleted file mode 100644 index 5b5482d766b1..000000000000 --- a/offload/DeviceRTL/src/Debug.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug utilities -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -extern "C" { -void __assert_assume(bool condition) { __builtin_assume(condition); } - -#ifndef OMPTARGET_HAS_LIBC -[[gnu::weak]] void __assert_fail(const char *expr, const char *file, - unsigned line, const char *function) { - __assert_fail_internal(expr, nullptr, file, line, function); -} -#endif - -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function) { - if (msg) { - printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, - msg, expr); - } else { - printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr); - } - __builtin_trap(); -} -} diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp deleted file mode 100644 index d6f8c499c890..000000000000 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "DeviceUtils.h" - -#include "Debug.h" -#include "Interface.h" -#include "Mapping.h" -#include "gpuintrin.h" - -using namespace ompx; - -uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { - return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; -} - -void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { - static_assert(sizeof(unsigned long) == 8, ""); - LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL); - HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32); -} - -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, - int32_t Width) { - return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width); -} - -int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { - return __gpu_ballot(Mask, Pred); -} - -bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); } - -extern "C" { -int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { - return utils::shuffleDown(lanes::All, Val, Delta, SrcLane); -} - -int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { - return utils::shuffleDown(lanes::All, Val, Delta, Width); -} -} diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp deleted file mode 100644 index 8c2828b27041..000000000000 --- a/offload/DeviceRTL/src/Kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the kernel entry points for the device. -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" -#include "Workshare.h" - -using namespace ompx; - -// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and -// must be kept in-sync. -enum OMPTgtExecModeFlags : unsigned char { - OMP_TGT_EXEC_MODE_BARE = 0, - OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, - OMP_TGT_EXEC_MODE_SPMD = 1 << 1, - OMP_TGT_EXEC_MODE_GENERIC_SPMD = - OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD, - OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD -}; - -static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - // Order is important here. - synchronize::init(IsSPMD); - mapping::init(IsSPMD); - state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); - workshare::init(IsSPMD); -} - -/// Simple generic state machine for worker threads. -static void genericStateMachine(IdentTy *Ident) { - uint32_t TId = mapping::getThreadIdInBlock(); - - do { - ParallelRegionFnTy WorkFn = nullptr; - - // Wait for the signal that we have a new work function. - synchronize::threads(atomic::seq_cst); - - // Retrieve the work function from the runtime. - bool IsActive = __kmpc_kernel_parallel(&WorkFn); - - // If there is nothing more to do, break out of the state machine by - // returning to the caller. - if (!WorkFn) - return; - - if (IsActive) { - ASSERT(!mapping::isSPMDMode(), nullptr); - ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); - __kmpc_kernel_end_parallel(); - } - - synchronize::threads(atomic::seq_cst); - - } while (true); -} - -extern "C" { - -/// Initialization -/// -/// \param Ident Source location identification, can be NULL. -/// -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; - bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; - bool UseGenericStateMachine = Configuration.UseGenericStateMachine; - if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); - synchronize::threadsAligned(atomic::relaxed); - } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); - // No need to wait since only the main threads will execute user - // code and workers will run into a barrier right away. - } - - if (IsSPMD) { - state::assumeInitialState(IsSPMD); - - // Synchronize to ensure the assertions above are in an aligned region. - // The barrier is eliminated later. - synchronize::threadsAligned(atomic::relaxed); - return -1; - } - - if (mapping::isInitialThreadInLevel0(IsSPMD)) - return -1; - - // Enter the generic state machine if enabled and if this thread can possibly - // be an active worker thread. - // - // The latter check is important for NVIDIA Pascal (but not Volta) and AMD - // GPU. In those cases, a single thread can apparently satisfy a barrier on - // behalf of all threads in the same warp. Thus, it would not be safe for - // other threads in the main thread's warp to reach the first - // synchronize::threads call in genericStateMachine before the main thread - // reaches its corresponding synchronize::threads call: that would permit all - // active worker threads to proceed before the main thread has actually set - // state::ParallelRegionFn, and then they would immediately quit without - // doing any work. mapping::getMaxTeamThreads() does not include any of the - // main thread's warp, so none of its threads can ever be active worker - // threads. - if (UseGenericStateMachine && - mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) - genericStateMachine(KernelEnvironment.Ident); - - return mapping::getThreadIdInBlock(); -} - -/// De-Initialization -/// -/// In non-SPMD, this function releases the workers trapped in a state machine -/// and also any memory dynamically allocated by the runtime. -/// -/// \param Ident Source location identification, can be NULL. -/// -void __kmpc_target_deinit() { - bool IsSPMD = mapping::isSPMDMode(); - if (IsSPMD) - return; - - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - // Signal the workers to exit the state machine and exit the kernel. - state::ParallelRegionFn = nullptr; - } else if (!state::getKernelEnvironment() - .Configuration.UseGenericStateMachine) { - // Retrieve the work function just to ensure we always call - // __kmpc_kernel_parallel even if a custom state machine is used. - // TODO: this is not super pretty. The problem is we create the call to - // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it - // is not there yet. Thus, we assume we never reach it from - // __kmpc_target_deinit. That allows us to remove the store in there to - // ParallelRegionFn, which leads to bad results later on. - ParallelRegionFnTy WorkFn = nullptr; - __kmpc_kernel_parallel(&WorkFn); - ASSERT(WorkFn == nullptr, nullptr); - } -} - -int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } -} diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp deleted file mode 100644 index 83f9233d9480..000000000000 --- a/offload/DeviceRTL/src/LibC.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "LibC.h" - -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) -extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; } -#else -extern "C" int vprintf(const char *format, __builtin_va_list); -#endif - -extern "C" { -[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) { - auto *L = reinterpret_cast<const unsigned char *>(lhs); - auto *R = reinterpret_cast<const unsigned char *>(rhs); - - for (size_t I = 0; I < count; ++I) - if (L[I] != R[I]) - return (int)L[I] - (int)R[I]; - - return 0; -} - -[[gnu::weak]] void memset(void *dst, int C, size_t count) { - auto *dstc = reinterpret_cast<char *>(dst); - for (size_t I = 0; I < count; ++I) - dstc[I] = C; -} - -[[gnu::weak]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} - -namespace ompx { -[[clang::no_builtin("printf")]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} // namespace ompx diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp deleted file mode 100644 index b145892d1ece..000000000000 --- a/offload/DeviceRTL/src/Mapping.cpp +++ /dev/null @@ -1,212 +0,0 @@ -//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Mapping.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" -#include "gpuintrin.h" - -using namespace ompx; - -// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI -// is set to 'none'. We only support COV5+ but this can be removed when COV4 is -// fully deprecated. -#ifdef __AMDGPU__ -extern const inline uint32_t __oclc_ABI_version = 500; -[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__; -#endif - -static bool isInLastWarp() { - uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & - ~(mapping::getWarpSize() - 1); - return mapping::getThreadIdInBlock() == MainTId; -} - -bool mapping::isMainThreadInGenericMode(bool IsSPMD) { - if (IsSPMD || icv::Level) - return false; - - // Check if this is the last warp in the block. - return isInLastWarp(); -} - -bool mapping::isMainThreadInGenericMode() { - return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); -} - -bool mapping::isInitialThreadInLevel0(bool IsSPMD) { - if (IsSPMD) - return mapping::getThreadIdInBlock() == 0; - return isInLastWarp(); -} - -bool mapping::isLeaderInWarp() { - __kmpc_impl_lanemask_t Active = mapping::activemask(); - __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); - return utils::popc(Active & LaneMaskLT) == 0; -} - -LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } - -LaneMaskTy mapping::lanemaskLT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_lt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - int64_t Ballot = mapping::activemask(); - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; - return Mask & Ballot; -#endif -} - -LaneMaskTy mapping::lanemaskGT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_gt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - if (Lane == (mapping::getWarpSize() - 1)) - return 0; - int64_t Ballot = mapping::activemask(); - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); - return Mask & Ballot; -#endif -} - -uint32_t mapping::getThreadIdInWarp() { - uint32_t ThreadIdInWarp = __gpu_lane_id(); - ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); - return ThreadIdInWarp; -} - -uint32_t mapping::getThreadIdInBlock(int32_t Dim) { - uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); - return ThreadIdInBlock; -} - -uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } - -uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { - uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); - // If we are in SPMD mode, remove one warp. - return BlockSize - (!IsSPMD * mapping::getWarpSize()); -} -uint32_t mapping::getMaxTeamThreads() { - return mapping::getMaxTeamThreads(mapping::isSPMDMode()); -} - -uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { - return __gpu_num_threads(Dim); -} - -uint32_t mapping::getNumberOfThreadsInKernel() { - return mapping::getNumberOfThreadsInBlock(0) * - mapping::getNumberOfBlocksInKernel(0) * - mapping::getNumberOfThreadsInBlock(1) * - mapping::getNumberOfBlocksInKernel(1) * - mapping::getNumberOfThreadsInBlock(2) * - mapping::getNumberOfBlocksInKernel(2); -} - -uint32_t mapping::getWarpIdInBlock() { - uint32_t WarpID = - mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); - ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); - return WarpID; -} - -uint32_t mapping::getBlockIdInKernel(int32_t Dim) { - uint32_t BlockId = __gpu_block_id(Dim); - ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); - return BlockId; -} - -uint32_t mapping::getNumberOfWarpsInBlock() { - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} - -uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { - return __gpu_num_blocks(Dim); -} - -uint32_t mapping::getNumberOfProcessorElements() { - return static_cast<uint32_t>(config::getHardwareParallelism()); -} - -///} - -/// Execution mode -/// -///{ - -// TODO: This is a workaround for initialization coming from kernels outside of -// the TU. We will need to solve this more correctly in the future. -[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode; - -void mapping::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - IsSPMDMode = IsSPMD; -} - -bool mapping::isSPMDMode() { return IsSPMDMode; } - -bool mapping::isGenericMode() { return !isSPMDMode(); } -///} - -extern "C" { -[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { - return mapping::getThreadIdInBlock(); -} - -[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { - return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); -} - -[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { - return mapping::getWarpSize(); -} -} - -#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \ - extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); } - -_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock) -_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) -_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) -_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) - -extern "C" { -uint64_t ompx_ballot_sync(uint64_t mask, int pred) { - return utils::ballotSync(mask, pred); -} - -int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { - return utils::shuffleDown(mask, var, delta, width); -} - -float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, - int width) { - return utils::bitCast<float>( - utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width)); -} - -long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { - return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width); -} - -double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, - int width) { - return utils::bitCast<double>( - utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width)); -} -} diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp deleted file mode 100644 index a89f8b2a7453..000000000000 --- a/offload/DeviceRTL/src/Misc.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "Shared/RPCOpcodes.h" -#include "shared/rpc.h" - -#include "Debug.h" - -namespace ompx { -namespace impl { - -/// Lookup a device-side function using a host pointer /p HstPtr using the table -/// provided by the device plugin. The table is an ordered pair of host and -/// device pointers sorted on the value of the host pointer. -void *indirectCallLookup(void *HstPtr) { - if (!HstPtr) - return nullptr; - - struct IndirectCallTable { - void *HstPtr; - void *DevPtr; - }; - IndirectCallTable *Table = - reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr()); - uint64_t TableSize = config::getIndirectCallTableSize(); - - // If the table is empty we assume this is device pointer. - if (!Table || !TableSize) - return HstPtr; - - uint32_t Left = 0; - uint32_t Right = TableSize; - - // If the pointer is definitely not contained in the table we exit early. - if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr) - return HstPtr; - - while (Left != Right) { - uint32_t Current = Left + (Right - Left) / 2; - if (Table[Current].HstPtr == HstPtr) - return Table[Current].DevPtr; - - if (HstPtr < Table[Current].HstPtr) - Right = Current; - else - Left = Current; - } - - // If we searched the whole table and found nothing this is a device pointer. - return HstPtr; -} - -/// The openmp client instance used to communicate with the server. -[[gnu::visibility("protected"), - gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); - -} // namespace impl -} // namespace ompx - -/// Interfaces -/// -///{ - -extern "C" { -int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } - -int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } - -double omp_get_wtick(void) { - // The number of ticks per second for the AMDGPU clock varies by card and can - // only be retrieved by querying the driver. We rely on the device environment - // to inform us what the proper frequency is. NVPTX uses a nanosecond - // resolution, we could omit the global read but this makes it consistent. - return 1.0 / ompx::config::getClockFrequency(); -} - -double omp_get_wtime(void) { - return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick(); -} - -void *__llvm_omp_indirect_call_lookup(void *HstPtr) { - return ompx::impl::indirectCallLookup(HstPtr); -} - -void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - return malloc(size); - default: - return nullptr; - } -} - -void omp_free(void *ptr, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - free(ptr); - case omp_null_allocator: - default: - return; - } -} - -unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { - rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>(); - Port.send_n(data, size); - Port.send([=](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = reinterpret_cast<uintptr_t>(fn); - }); - unsigned long long Ret; - Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Ret = static_cast<unsigned long long>(Buffer->data[0]); - }); - Port.close(); - return Ret; -} -} - -///} diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp deleted file mode 100644 index 0ea2f89337fe..000000000000 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ /dev/null @@ -1,377 +0,0 @@ -//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -void numThreadsStrictError(int32_t nt_strict, int32_t nt_severity, - const char *nt_message, int32_t requested, - int32_t actual) { - if (nt_message) - printf("%s\n", nt_message); - else - printf("The computed number of threads (%u) does not match the requested " - "number of threads (%d). Consider that it might not be supported " - "to select exactly %d threads on this target device.\n", - actual, requested, requested); - if (nt_severity == severity_fatal) - __builtin_trap(); -} - -uint32_t determineNumberOfThreads(int32_t NumThreadsClause, - int32_t nt_strict = false, - int32_t nt_severity = severity_fatal, - const char *nt_message = nullptr) { - uint32_t NThreadsICV = - NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getMaxTeamThreads(); - - if (NThreadsICV != 0 && NThreadsICV < NumThreads) - NumThreads = NThreadsICV; - - // SPMD mode allows any number of threads, for generic mode we round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (!mapping::isSPMDMode()) { - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); - } - - if (NumThreadsClause != -1 && nt_strict && - NumThreads != static_cast<uint32_t>(NumThreadsClause)) - numThreadsStrictError(nt_strict, nt_severity, nt_message, NumThreadsClause, - NumThreads); - - return NumThreads; -} - -// Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { - switch (nargs) { -#include "generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -} // namespace - -extern "C" { - -[[clang::always_inline]] void __kmpc_parallel_spmd_impl( - IdentTy *ident, int32_t num_threads, void *fn, void **args, - const int64_t nargs, int32_t nt_strict = false, - int32_t nt_severity = severity_fatal, const char *nt_message = nullptr) { - uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = - determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message); - uint32_t PTeamSize = - NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; - // Avoid the race between the read of the `icv::Level` above and the write - // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - - // Synchronize all threads after the main thread (TId == 0) set up the - // team state properly. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(PTeamSize, ident, - /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - - if (!PTeamSize || TId < PTeamSize) - invokeMicrotask(TId, 0, fn, args, nargs); - - // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); - } - - // Synchronize all threads to make sure every thread exits the scope above; - // otherwise the following assertions and the assumption in - // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - - return; -} - -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { - return __kmpc_parallel_spmd_impl(ident, num_threads, fn, args, nargs); -} - -[[clang::always_inline]] void __kmpc_parallel_spmd_60( - IdentTy *ident, int32_t num_threads, void *fn, void **args, - const int64_t nargs, int32_t nt_strict = false, - int32_t nt_severity = severity_fatal, const char *nt_message = nullptr) { - return __kmpc_parallel_spmd_impl(ident, num_threads, fn, args, nargs, - nt_strict, nt_severity, nt_message); -} - -[[clang::always_inline]] void __kmpc_parallel_impl( - IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, - int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs, - int32_t nt_strict = false, int32_t nt_severity = severity_fatal, - const char *nt_message = nullptr) { - uint32_t TId = mapping::getThreadIdInBlock(); - - // Assert the parallelism level is zero if disabled by the user. - ASSERT((config::mayUseNestedParallelism() || icv::Level == 0), - "nested parallelism while disabled"); - - // Handle the serialized case first, same for SPMD/non-SPMD: - // 1) if-clause(0) - // 2) parallel in task or other thread state inducing construct - // 3) nested parallel regions - if (OMP_UNLIKELY(!if_expr || state::HasThreadState || - (config::mayUseNestedParallelism() && icv::Level))) { - // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have - // effect when parallel execution is disabled by a corresponding if clause - // attached to the parallel directive. - if (nt_strict && num_threads > 1) - numThreadsStrictError(nt_strict, nt_severity, nt_message, num_threads, 1); - state::DateEnvironmentRAII DERAII(ident); - ++icv::Level; - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - // From this point forward we know that there is no thread state used. - ASSERT(state::HasThreadState == false, nullptr); - - if (mapping::isSPMDMode()) { - // This was moved to its own routine so it could be called directly - // in certain situations to avoid resource consumption of unused - // logic in parallel_51. - if (nt_strict) - __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); - else - __kmpc_parallel_spmd_60(ident, num_threads, fn, args, nargs, nt_strict, - nt_severity, nt_message); - - return; - } - - uint32_t NumThreads = - determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message); - uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); - uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - - // We do *not* create a new data environment because all threads in the team - // that are active are now running this parallel region. They share the - // TeamState, which has an increase level-var and potentially active-level - // set, but they do not have individual ThreadStates yet. If they ever - // modify the ICVs beyond this point a ThreadStates will be allocated. - - bool IsActiveParallelRegion = NumThreads > 1; - if (!IsActiveParallelRegion) { - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - void **GlobalArgs = nullptr; - if (nargs) { - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - switch (nargs) { - default: - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - break; - case 16: - GlobalArgs[15] = args[15]; - [[fallthrough]]; - case 15: - GlobalArgs[14] = args[14]; - [[fallthrough]]; - case 14: - GlobalArgs[13] = args[13]; - [[fallthrough]]; - case 13: - GlobalArgs[12] = args[12]; - [[fallthrough]]; - case 12: - GlobalArgs[11] = args[11]; - [[fallthrough]]; - case 11: - GlobalArgs[10] = args[10]; - [[fallthrough]]; - case 10: - GlobalArgs[9] = args[9]; - [[fallthrough]]; - case 9: - GlobalArgs[8] = args[8]; - [[fallthrough]]; - case 8: - GlobalArgs[7] = args[7]; - [[fallthrough]]; - case 7: - GlobalArgs[6] = args[6]; - [[fallthrough]]; - case 6: - GlobalArgs[5] = args[5]; - [[fallthrough]]; - case 5: - GlobalArgs[4] = args[4]; - [[fallthrough]]; - case 4: - GlobalArgs[3] = args[3]; - [[fallthrough]]; - case 3: - GlobalArgs[2] = args[2]; - [[fallthrough]]; - case 2: - GlobalArgs[1] = args[1]; - [[fallthrough]]; - case 1: - GlobalArgs[0] = args[0]; - [[fallthrough]]; - case 0: - break; - } - } - - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. - state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, - (void *)nullptr, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - - // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); - // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); - } - - if (nargs) - __kmpc_end_sharing_variables(); -} - -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t id, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { - return __kmpc_parallel_impl(ident, id, if_expr, num_threads, proc_bind, fn, - wrapper_fn, args, nargs); -} - -[[clang::always_inline]] void __kmpc_parallel_60( - IdentTy *ident, int32_t id, int32_t if_expr, int32_t num_threads, - int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs, - int32_t nt_strict = false, int32_t nt_severity = severity_fatal, - const char *nt_message = nullptr) { - return __kmpc_parallel_impl(ident, id, if_expr, num_threads, proc_bind, fn, - wrapper_fn, args, nargs, nt_strict, nt_severity, - nt_message); -} - -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { - // Work function and arguments for L1 parallel region. - *WorkFn = state::ParallelRegionFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) - return false; - - // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); - return ThreadIsActive; -} - -[[clang::noinline]] void __kmpc_kernel_end_parallel() { - // In case we have modified an ICV for this thread before a ThreadState was - // created. We drop it now to not contaminate the next parallel region. - ASSERT(!mapping::isSPMDMode(), nullptr); - uint32_t TId = mapping::getThreadIdInBlock(); - state::resetStateForThread(TId); - ASSERT(!mapping::isSPMDMode(), nullptr); -} - -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } - -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } - -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} - -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} -} diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp deleted file mode 100644 index df141af5ebee..000000000000 --- a/offload/DeviceRTL/src/Profiling.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===------- Profiling.cpp ---------------------------------------- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Profiling.h" - -extern "C" { - -// Provides empty implementations for certain functions in compiler-rt -// that are emitted by the PGO instrumentation. -void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} -} diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp deleted file mode 100644 index fffd0063940c..000000000000 --- a/offload/DeviceRTL/src/Reduction.cpp +++ /dev/null @@ -1,316 +0,0 @@ -//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { - for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -static uint32_t gpu_irregular_simd_reduce(void *reduce_data, - ShuffleReductFnTy shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); - __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); - do { - Liveness = mapping::activemask(); - remote_id = utils::ffs(Liveness & lanemask_gt); - size = utils::popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - uint32_t BlockThreadId = mapping::getThreadIdInBlock(); - if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false)) - BlockThreadId = 0; - uint32_t NumThreads = omp_get_num_threads(); - if (NumThreads == 1) - return 1; - - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // - -#if __has_builtin(__nvvm_reflect) - if (__nvvm_reflect("__CUDA_ARCH") >= 700) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || - (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize()); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; - } -#endif - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - if (Liveness == lanes::All) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/utils::popc(Liveness), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - else { // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - } - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case - // of an L2 parallel region. - return BlockThreadId == 0; -} - -uint32_t roundToWarpsize(uint32_t s) { - if (s < mapping::getWarpSize()) - return 1; - return (s & ~(unsigned)(mapping::getWarpSize() - 1)); -} - -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } - -} // namespace - -extern "C" { -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); -} - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) { - // Terminate all threads in non-SPMD mode except for the master thread. - uint32_t ThreadId = mapping::getThreadIdInBlock(); - if (mapping::isGenericMode()) { - if (!mapping::isMainThreadInGenericMode()) - return 0; - ThreadId = 0; - } - - uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; - uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = omp_get_num_threads(); - uint32_t TeamId = omp_get_team_num(); - uint32_t NumTeams = omp_get_num_teams(); - [[clang::loader_uninitialized]] static Local<unsigned> Bound; - [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. - bool IsMaster = (ThreadId == 0); - while (IsMaster) { - Bound = atomic::load(&IterCnt, atomic::acquire); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) { - lgcpyFct(GlobalBuffer, ModBockId, reduce_data); - } else - lgredFct(GlobalBuffer, ModBockId, reduce_data); - - // Propagate the memory writes above to the world. - fence::kernel(atomic::release); - - // Increment team counter. - // This counter is incremented by all teams in the current - // num_of_records chunk. - ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst, - atomic::MemScopeTy::device); - } - - // Synchronize in SPMD mode as in generic mode all but 1 threads are in the - // state machine. - if (mapping::isSPMDMode()) - synchronize::threadsAligned(atomic::acq_rel); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // Ensure we see the global memory writes by other teams - fence::kernel(atomic::acquire); - - // - // Last team processing. - // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(GlobalBuffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(GlobalBuffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); - if (ActiveThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst); - } - - return 0; -} -} - -void *__kmpc_reduction_get_fixed_buffer() { - return state::getKernelLaunchEnvironment().ReductionBuffer; -} diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp deleted file mode 100644 index 475395102f47..000000000000 --- a/offload/DeviceRTL/src/State.cpp +++ /dev/null @@ -1,482 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -/// Memory implementation -/// -///{ - -/// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[]; - -/// The kernel environment passed to the init method by the compiler. -[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *> - KernelEnvironmentPtr; - -/// The kernel launch environment passed as argument to the kernel by the -/// runtime. -[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> - KernelLaunchEnvironmentPtr; - -///} - -namespace { - -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - -/// A "smart" stack in shared memory. -/// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. -/// -struct SharedMemorySmartStackTy { - /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); - - /// Allocate \p Bytes on the stack for the encountering thread. Each thread - /// can call this function. - void *push(uint64_t Bytes); - - /// Deallocate the last allocation made by the encountering thread and pointed - /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); - -private: - /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock, - allocator::ALIGNMENT); - } - - /// Return the top address of the warp data stack, that is the first address - /// this warp will allocate memory at next. - void *getThreadDataTop(uint32_t TId) { - return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; - } - - /// The actual storage, shared among all warps. - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; -}; - -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - -/// The allocation of a single shared memory scratchpad. -[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy> - SharedMemorySmartStack; - -void SharedMemorySmartStackTy::init(bool IsSPMD) { - Usage[mapping::getThreadIdInBlock()] = 0; -} - -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { - // First align the number of requested bytes. - /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to - /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - - uint32_t StorageTotal = computeThreadStorageTotal(); - - // The main thread in generic mode gets the space of its entire warp as the - // other threads do not participate in any computation at all. - if (mapping::isMainThreadInGenericMode()) - StorageTotal *= mapping::getWarpSize(); - - int TId = mapping::getThreadIdInBlock(); - if (Usage[TId] + AlignedBytes <= StorageTotal) { - void *Ptr = getThreadDataTop(TId); - Usage[TId] += AlignedBytes; - return Ptr; - } - - if (config::isDebugMode(DeviceDebugKind::CommonIssues)) - printf("Shared memory stack full, fallback to dynamic allocation of global " - "memory will negatively impact performance.\n"); - void *GlobalMemory = memory::allocGlobal( - AlignedBytes, "Slow path shared memory allocation, insufficient " - "shared memory stack memory!"); - ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!"); - - return GlobalMemory; -} - -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - if (utils::isSharedMemPtr(Ptr)) { - int TId = mapping::getThreadIdInBlock(); - Usage[TId] -= AlignedBytes; - return; - } - memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); -} - -} // namespace - -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } - -void *memory::allocShared(uint64_t Bytes, const char *Reason) { - return SharedMemorySmartStack.push(Bytes); -} - -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { - SharedMemorySmartStack.pop(Ptr, Bytes); -} - -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); - if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) - printf("nullptr returned by malloc!\n"); - return Ptr; -} - -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } - -///} - -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { - return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & - (ActiveLevelVar == Other.ActiveLevelVar) & - (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & - (RunSchedVar == Other.RunSchedVar) & - (RunSchedChunkVar == Other.RunSchedChunkVar); -} - -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { - ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); - ASSERT(LevelVar == Other.LevelVar, nullptr); - ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); - ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); - ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); - ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); -} - -void state::TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = 0; - ICVState.LevelVar = 0; - ICVState.ActiveLevelVar = 0; - ICVState.Padding0Val = 0; - ICVState.MaxActiveLevelsVar = 1; - ICVState.RunSchedVar = omp_sched_static; - ICVState.RunSchedChunkVar = 1; - ParallelTeamSize = 1; - HasThreadState = false; - ParallelRegionFnVar = nullptr; -} - -bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { - return (ICVState == Other.ICVState) & - (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); -} - -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { - ICVState.assertEqual(Other.ICVState); - ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); - ASSERT(HasThreadState == Other.HasThreadState, nullptr); -} - -[[clang::loader_uninitialized]] Local<state::TeamStateTy> - ompx::state::TeamState; -[[clang::loader_uninitialized]] Local<state::ThreadStateTy **> - ompx::state::ThreadStates; - -namespace { - -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { - if (Level == 0) - return DefaultVal; - int LevelVar = omp_get_level(); - if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) - return OutOfBoundsVal; - int ActiveLevel = icv::ActiveLevel; - if (OMP_UNLIKELY(Level != ActiveLevel)) - return DefaultVal; - return Val; -} - -} // namespace - -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - SharedMemorySmartStack.init(IsSPMD); - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - TeamState.init(IsSPMD); - ThreadStates = nullptr; - KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; - } -} - -KernelEnvironmentTy &state::getKernelEnvironment() { - return *KernelEnvironmentPtr; -} - -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { - return *KernelLaunchEnvironmentPtr; -} - -void state::enterDataEnvironment(IdentTy *Ident) { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - if (!config::mayUseThreadStates()) - return; - - unsigned TId = mapping::getThreadIdInBlock(); - ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>( - memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); - uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); - if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = - sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); - void *ThreadStatesPtr = - memory::allocGlobal(Bytes, "Thread state array allocation"); - __builtin_memset(ThreadStatesPtr, 0, Bytes); - if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), - reinterpret_cast<uintptr_t>(ThreadStatesPtr), - atomic::seq_cst, atomic::seq_cst)) - memory::freeGlobal(ThreadStatesPtr, - "Thread state array allocated multiple times"); - ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst), - "Expected valid thread states bit!"); - } - NewThreadState->init(ThreadStates[TId]); - TeamState.HasThreadState = true; - ThreadStates[TId] = NewThreadState; -} - -void state::exitDataEnvironment() { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - - unsigned TId = mapping::getThreadIdInBlock(); - resetStateForThread(TId); -} - -void state::resetStateForThread(uint32_t TId) { - if (!config::mayUseThreadStates()) - return; - if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) - return; - - ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; - memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc"); - ThreadStates[TId] = PreviousThreadState; -} - -void state::runAndCheckState(void(Func(void))) { - TeamStateTy OldTeamState = TeamState; - OldTeamState.assertEqual(TeamState); - - Func(); - - OldTeamState.assertEqual(TeamState); -} - -void state::assumeInitialState(bool IsSPMD) { - TeamStateTy InitialTeamState; - InitialTeamState.init(IsSPMD); - InitialTeamState.assertEqual(TeamState); - ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); -} - -int state::getEffectivePTeamSize() { - int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); -} - -extern "C" { -void omp_set_dynamic(int V) {} - -int omp_get_dynamic(void) { return 0; } - -void omp_set_num_threads(int V) { icv::NThreads = V; } - -int omp_get_max_threads(void) { - int NT = icv::NThreads; - return NT > 0 ? NT : mapping::getMaxTeamThreads(); -} - -int omp_get_level(void) { - int LevelVar = icv::Level; - ASSERT(LevelVar >= 0, nullptr); - return LevelVar; -} - -int omp_get_active_level(void) { return !!icv::ActiveLevel; } - -int omp_in_parallel(void) { return !!icv::ActiveLevel; } - -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { - *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); - *ChunkSize = state::RunSchedChunk; -} - -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { - icv::RunSched = (int)ScheduleKind; - state::RunSchedChunk = ChunkSize; -} - -int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); -} - -int omp_get_thread_num(void) { - return omp_get_ancestor_thread_num(omp_get_level()); -} - -int omp_get_team_size(int Level) { - return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); -} - -int omp_get_num_threads(void) { - return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); -} - -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } - -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } - -void omp_set_nested(int) {} - -int omp_get_nested(void) { return false; } - -void omp_set_max_active_levels(int Levels) { - icv::MaxActiveLevels = Levels > 0 ? 1 : 0; -} - -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } - -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } - -int omp_get_num_places(void) { return 0; } - -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } - -void omp_get_place_proc_ids(int, int *) { - // TODO -} - -int omp_get_place_num(void) { return 0; } - -int omp_get_partition_num_places(void) { return 0; } - -void omp_get_partition_place_nums(int *) { - // TODO -} - -int omp_get_cancellation(void) { return 0; } - -void omp_set_default_device(int) {} - -int omp_get_default_device(void) { return -1; } - -int omp_get_num_devices(void) { return config::getNumDevices(); } - -int omp_get_device_num(void) { return config::getDeviceNum(); } - -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } - -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } - -int omp_get_initial_device(void) { return -1; } - -int omp_is_initial_device(void) { return 0; } -} - -extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { - return memory::allocShared(Bytes, "Frontend alloc shared"); -} - -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { - memory::freeShared(Ptr, Bytes, "Frontend free shared"); -} - -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } - -void *llvm_omp_target_dynamic_shared_alloc() { - return __kmpc_get_dynamic_shared(); -} - -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. -constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; - -[[clang::loader_uninitialized]] static Local<void *> - SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static Local<void **> - SharedMemVariableSharingSpacePtr; - -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal( - nArgs * sizeof(void *), "new extended args"); - ASSERT(SharedMemVariableSharingSpacePtr != nullptr, - "Nullptr returned by malloc!"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} -} diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp deleted file mode 100644 index e833423eb265..000000000000 --- a/offload/DeviceRTL/src/Stub.cpp +++ /dev/null @@ -1 +0,0 @@ -// This is an empty file used to create a device fatbinary. diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp deleted file mode 100644 index 2f1ed34a3f6d..000000000000 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ /dev/null @@ -1,379 +0,0 @@ -//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. -// -//===----------------------------------------------------------------------===// - -#include "Synchronization.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -namespace impl { - -/// Atomics -/// -///{ -///} - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - // builtin_amdgcn_atomic_inc32 should expand to this switch when - // passed a runtime value, but does not do so yet. Workaround here. - -#define ScopeSwitch(ORDER) \ - switch (MemScope) { \ - case atomic::MemScopeTy::system: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \ - case atomic::MemScopeTy::device: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \ - case atomic::MemScopeTy::workgroup: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \ - case atomic::MemScopeTy::wavefront: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \ - case atomic::MemScopeTy::single: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \ - } - -#define Case(ORDER) \ - case ORDER: \ - ScopeSwitch(ORDER) - - switch (Ordering) { - default: - __builtin_unreachable(); - Case(atomic::relaxed); - Case(atomic::acquire); - Case(atomic::release); - Case(atomic::acq_rel); - Case(atomic::seq_cst); -#undef Case -#undef ScopeSwitch - } -} - -[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker; - -void namedBarrierInit() { - // Don't have global ctors, and shared memory is not zero init - atomic::store(&namedBarrierTracker, 0u, atomic::release); -} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - // assert(NumThreads % 32 == 0); - - uint32_t WarpSize = mapping::getWarpSize(); - uint32_t NumWaves = NumThreads / WarpSize; - - fence::team(atomic::acquire); - - // named barrier implementation for amdgcn. - // Uses two 16 bit unsigned counters. One for the number of waves to have - // reached the barrier, and one to count how many times the barrier has been - // passed. These are packed in a single atomically accessed 32 bit integer. - // Low bits for the number of waves, assumed zero before this call. - // High bits to count the number of times the barrier has been passed. - - // precondition: NumWaves != 0; - // invariant: NumWaves * WarpSize == NumThreads; - // precondition: NumWaves < 0xffffu; - - // Increment the low 16 bits once, using the lowest active thread. - if (mapping::isLeaderInWarp()) { - uint32_t load = atomic::add(&namedBarrierTracker, 1, - atomic::relaxed); // commutative - - // Record the number of times the barrier has been passed - uint32_t generation = load & 0xffff0000u; - - if ((load & 0x0000ffffu) == (NumWaves - 1)) { - // Reached NumWaves in low bits so this is the last wave. - // Set low bits to zero and increment high bits - load += 0x00010000u; // wrap is safe - load &= 0xffff0000u; // because bits zeroed second - - // Reset the wave counter and release the waiting waves - atomic::store(&namedBarrierTracker, load, atomic::relaxed); - } else { - // more waves still to go, spin until generation counter changes - do { - __builtin_amdgcn_s_sleep(0); - load = atomic::load(&namedBarrierTracker, atomic::relaxed); - } while ((load & 0xffff0000u) == generation); - } - } - fence::team(atomic::release); -} - -void fenceTeam(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); -} - -void fenceKernel(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::device); -} - -void fenceSystem(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::system); -} - -void syncWarp(__kmpc_impl_lanemask_t) { - // This is a no-op on current AMDGPU hardware but it is used by the optimizer - // to enforce convergent behaviour between control flow graphs. - __builtin_amdgcn_wave_barrier(); -} - -void syncThreads(atomic::OrderingTy Ordering) { - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); - - __builtin_amdgcn_s_barrier(); - - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst); -} -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } - -// TODO: Don't have wavefront lane locks. Possibly can't have them. -void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } - -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -void unsetCriticalLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); -} - -void setCriticalLock(omp_lock_t *Lock) { - uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; - if (mapping::getThreadIdInWarp() == LowestActiveThread) { - fenceKernel(atomic::release); - while ( - !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) { - __builtin_amdgcn_s_sleep(32); - } - fenceKernel(atomic::acquire); - } -} - -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return __nvvm_atom_inc_gen_ui(Address, Val); -} - -void namedBarrierInit() {} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - ASSERT(NumThreads % 32 == 0, nullptr); - - // The named barrier for active parallel threads of a team in an L1 parallel - // region to synchronize with each other. - constexpr int BarrierNo = 7; - __nvvm_barrier_sync_cnt(BarrierNo, NumThreads); -} - -void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } - -void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } - -void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } - -void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } - -void syncThreads(atomic::OrderingTy Ordering) { - constexpr int BarrierNo = 8; - __nvvm_barrier_sync(BarrierNo); -} - -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } - -constexpr uint32_t OMP_SPIN = 1000; -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -// TODO: This seems to hide a bug in the declare variant handling. If it is -// called before it is defined -// here the overload won't happen. Investigate lalter! -void unsetLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst); -} - -int testLock(omp_lock_t *Lock) { - return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst); -} - -void initLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setLock(omp_lock_t *Lock) { - // TODO: not sure spinning is a good idea here.. - while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, - atomic::seq_cst) != UNSET) { - int32_t start = __nvvm_read_ptx_sreg_clock(); - int32_t now; - for (;;) { - now = __nvvm_read_ptx_sreg_clock(); - int32_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } - -#endif -///} - -} // namespace impl - -void synchronize::init(bool IsSPMD) { - if (!IsSPMD) - impl::namedBarrierInit(); -} - -void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } - -void synchronize::threads(atomic::OrderingTy Ordering) { - impl::syncThreads(Ordering); -} - -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { - impl::syncThreadsAligned(Ordering); -} - -void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } - -void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); } - -void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } - -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return impl::atomicInc(Addr, V, Ordering, MemScope); -} - -void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } - -extern "C" { -void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { - __kmpc_barrier(Loc, TId); - return 0; -} - -void __kmpc_barrier(IdentTy *Loc, int32_t TId) { - if (mapping::isSPMDMode()) - return __kmpc_barrier_simple_spmd(Loc, TId); - - // Generic parallel regions are run with multiple of the warp size or single - // threaded, in the latter case we need to stop here. - if (omp_get_num_threads() == 1) - return __kmpc_flush(Loc); - - impl::namedBarrier(); -} - -[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { - synchronize::threadsAligned(atomic::OrderingTy::seq_cst); -} - -[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { - synchronize::threads(atomic::OrderingTy::seq_cst); -} - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { - return omp_get_thread_num() == 0; -} - -void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { - return omp_get_thread_num() == Filter; -} - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { - return __kmpc_master(Loc, TId); -} - -void __kmpc_end_single(IdentTy *Loc, int32_t TId) { - // The barrier is explicitly called. -} - -void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } - -uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); } - -void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } - -void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } - -void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } - -void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } - -void ompx_sync_block(int Ordering) { - impl::syncThreadsAligned(atomic::OrderingTy(Ordering)); -} -void ompx_sync_block_acq_rel() { - impl::syncThreadsAligned(atomic::OrderingTy::acq_rel); -} -void ompx_sync_block_divergent(int Ordering) { - impl::syncThreads(atomic::OrderingTy(Ordering)); -} -} // extern "C" diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp deleted file mode 100644 index d0be0ace50df..000000000000 --- a/offload/DeviceRTL/src/Tasking.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// TODO: We should not allocate and execute the task in two steps. A new API is -// needed for that though. -// -//===----------------------------------------------------------------------===// - -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" - -using namespace ompx; - -extern "C" { - -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn) { - auto TaskSizeInclPrivateValuesPadded = - utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *)); - auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; - TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( - TaskSizeTotal, "explicit task descriptor"); - TaskDescriptor->Payload = - utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); - TaskDescriptor->TaskFn = TaskFn; - - return TaskDescriptor; -} - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *) { - state::DateEnvironmentRAII DERAII(Loc); - - TaskDescriptor->TaskFn(0, TaskDescriptor); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); - return 0; -} - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::enterDataEnvironment(Loc); -} - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::exitDataEnvironment(); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); -} - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *) {} - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *) { - // Skip task entirely if empty iteration space. - if (*LowerBound > *UpperBound) - return; - - // The compiler has already stored lb and ub in the TaskDescriptorTy structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. - return 1; -} - -int omp_get_max_task_priority(void) { return 0; } -} diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp deleted file mode 100644 index 59a2cc3f27ac..000000000000 --- a/offload/DeviceRTL/src/Workshare.cpp +++ /dev/null @@ -1,970 +0,0 @@ -//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. -// -//===----------------------------------------------------------------------===// - -#include "Workshare.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -// TODO: -struct DynamicScheduleTracker { - int64_t Chunk; - int64_t LoopUpperBound; - int64_t NextLowerBound; - int64_t Stride; - kmp_sched_t ScheduleType; - DynamicScheduleTracker *NextDST; -}; - -#define ASSERT0(...) - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -// TODO: This variable is a hack inherited from the old runtime. -[[clang::loader_uninitialized]] static Local<uint64_t> Cnt; - -template <typename T, typename ST> struct omptarget_nvptx_LoopSupport { - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, - T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the beginning of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, - T entityId, T numberOfEntities) { - // No chunk size specified. Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - int32_t gtid = omp_get_thread_num(); - int numberOfActiveOMPThreads = omp_get_num_threads(); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * omp_get_team_num() + gtid, - omp_get_num_teams() * numberOfActiveOMPThreads); - break; - } - default: { - // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - static void dispatch_init(IdentTy *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, - DynamicScheduleTracker *DST) { - int tid = mapping::getThreadIdInBlock(); - T tnum = omp_get_num_threads(); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched; - int ChunkInt; - omp_get_schedule(&rtSched, &ChunkInt); - chunk = ChunkInt; - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - } else { - // ASSERT(LT_FUSSY, - // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - // "unknown schedule %d & chunk %lld\n", (int)schedule, - // (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - DST->ScheduleType = schedule; - if (chunk < 1) - chunk = 1; - DST->Chunk = chunk; - DST->LoopUpperBound = ub; - DST->NextLowerBound = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - Cnt = 0; - fence::team(atomic::seq_cst); - } - __kmpc_barrier(loc, threadId); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = mapping::activemask(); - uint32_t leader = utils::ffs(active) - 1; - uint32_t change = utils::popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT(); - unsigned int rank = utils::popc(active & lane_mask_lt); - uint64_t warp_res = 0; - if (rank == 0) { - warp_res = atomic::add(&Cnt, change, atomic::seq_cst); - } - warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); - return warp_res + rank; - } - - static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, - T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - return FINISHED; - } - - static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride, - DynamicScheduleTracker *DST) { - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = DST->ScheduleType; - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = DST->NextLowerBound; - T ub = DST->LoopUpperBound; - // finished? - if (myLb > ub) { - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = DST->Chunk; - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = DST->Stride; - DST->NextLowerBound = myLb + stride; - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, - DST->LoopUpperBound); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - return DISPATCH_NOTFINISHED; - } - - static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// TODO: Expand the dispatch API to take a DST pointer which can then be -// allocated properly without malloc. -// For now, each team will contain an LDS pointer (ThreadDST) to a global array -// of references to the DST structs allocated (in global memory) for each thread -// in the team. The global memory array is allocated during the init phase if it -// was not allocated already and will be deallocated when the dispatch phase -// ends: -// -// __kmpc_dispatch_init -// -// ** Dispatch loop ** -// -// __kmpc_dispatch_deinit -// -[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **> - ThreadDST; - -// Create a new DST, link the current one, and define the new as current. -static DynamicScheduleTracker *pushDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - // Each block will allocate an array of pointers to DST structs. The array is - // equal in length to the number of threads in that block. - if (!ThreadDST) { - // Allocate global memory array of pointers to DST structs: - if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0) - ThreadDST = static_cast<DynamicScheduleTracker **>( - memory::allocGlobal(mapping::getNumberOfThreadsInBlock() * - sizeof(DynamicScheduleTracker *), - "new ThreadDST array")); - synchronize::threads(atomic::seq_cst); - - // Initialize the array pointers: - ThreadDST[ThreadIndex] = nullptr; - } - - // Create a DST struct for the current thread: - DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>( - memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST")); - *NewDST = DynamicScheduleTracker({0}); - - // Add the new DST struct to the array of DST structs: - NewDST->NextDST = ThreadDST[ThreadIndex]; - ThreadDST[ThreadIndex] = NewDST; - return NewDST; -} - -// Return the current DST. -static DynamicScheduleTracker *peekDST() { - return ThreadDST[mapping::getThreadIdInBlock()]; -} - -// Pop the current DST and restore the last one. -static void popDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex]; - DynamicScheduleTracker *OldDST = CurrentDST->NextDST; - memory::freeGlobal(CurrentDST, "remove DST"); - ThreadDST[ThreadIndex] = OldDST; - - // Check if we need to deallocate the global array. Ensure all threads - // in the block have finished deallocating the individual DSTs. - synchronize::threads(atomic::seq_cst); - if (!ThreadDST[ThreadIndex] && !ThreadIndex) { - memory::freeGlobal(ThreadDST, "remove ThreadDST array"); - ThreadDST = nullptr; - } - synchronize::threads(atomic::seq_cst); -} - -void workshare::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - ThreadDST = nullptr; -} - -extern "C" { - -// init -void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule, - int32_t lb, int32_t ub, int32_t st, int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule, - uint32_t lb, uint32_t ub, int32_t st, - int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule, - int64_t lb, int64_t ub, int64_t st, int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule, - uint64_t lb, uint64_t ub, int64_t st, - int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -// next -int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -// fini -void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini(); -} - -// deinit -void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} - -void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} -} - -namespace ompx { - -/// Helper class to hide the generic loop nest and provide the template argument -/// throughout. -template <typename Ty> class StaticLoopChunker { - - /// Generic loop nest that handles block and/or thread distribution in the - /// absence of user specified chunk sizes. This implicitly picks a block chunk - /// size equal to the number of threads in the block and a thread chunk size - /// equal to one. In contrast to the chunked version we can get away with a - /// single loop in this case - static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, - Ty NumBlocks, Ty BId, Ty NumThreads, - Ty TId, Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * NumThreads; - - // Start index in the normalized space. - Ty IV = BId * NumThreads + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - if (IV < NumIters) { - do { - - // Execute the loop body. - LoopBody(IV, Arg); - - // Every thread executed one block and thread chunk now. - IV += KernelIteration; - - if (OneIterationPerThread) - return; - - } while (IV < NumIters); - } - } - - /// Generic loop nest that handles block and/or thread distribution in the - /// presence of user specified chunk sizes (for at least one of them). - static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg, - Ty BlockChunk, Ty NumBlocks, Ty BId, - Ty ThreadChunk, Ty NumThreads, Ty TId, - Ty NumIters, - uint8_t OneIterationPerThread) { - Ty KernelIteration = NumBlocks * BlockChunk; - - // Start index in the chunked space. - Ty IV = BId * BlockChunk + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - do { - - Ty BlockChunkLeft = - BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0; - Ty ThreadChunkLeft = - ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft; - - while (ThreadChunkLeft--) { - - // Given the blocking it's hard to keep track of what to execute. - if (IV >= NumIters) - return; - - // Execute the loop body. - LoopBody(IV, Arg); - - if (OneIterationPerThread) - return; - - ++IV; - } - - IV += KernelIteration; - - } while (IV < NumIters); - } - -public: - /// Worksharing `for`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but we don't know if we are in a - // parallel at all or if the user might have used a `num_threads` clause - // on the parallel and reduced the number compared to the block size. - // Since nested parallels are possible too we need to get the thread id - // from the `omp` getter and not the mapping directly. - Ty TId = omp_get_thread_num(); - - // There are no blocks involved here. - Ty BlockChunk = 0; - Ty NumBlocks = 1; - Ty BId = 0; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumThreads >= NumIters, "Broken assumption"); - - if (ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - } - - /// Worksharing `distribute`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] BlockChunk Size of block chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - - // There are no threads involved here. - Ty ThreadChunk = 0; - Ty NumThreads = 1; - Ty TId = 0; - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If we know we have more blocks than iterations we can indicate that to - // avoid an outer loop. - if (config::getAssumeTeamsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - } - - /// Worksharing `distribute parallel for`-loop. - /// \param[in] Loc Description of source location - /// \param[in] LoopBody Function which corresponds to loop body - /// \param[in] Arg Pointer to struct which contains loop body args - /// \param[in] NumIters Number of loop iterations - /// \param[in] NumThreads Number of GPU threads - /// \param[in] BlockChunk Size of block chunk - /// \param[in] ThreadChunk Size of thread chunk - /// \param[in] OneIterationPerThread If true/nonzero, each thread executes - /// only one loop iteration or one thread chunk. This avoids an outer loop - /// over all loop iterations/chunks. - static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), - void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk, - uint8_t OneIterationPerThread) { - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but the user might have used a - // `num_threads` clause on the parallel and reduced the number compared to - // the block size. - Ty TId = mapping::getThreadIdInBlock(); - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads (across all blocks) than iterations we - // can indicate that to avoid an outer loop. - if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - - if (OneIterationPerThread) - ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); - - if (BlockChunk != NumThreads || ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - } -}; - -} // namespace ompx - -#define OMP_LOOP_ENTRY(BW, TY) \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY block_chunk, TY thread_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::DistributeFor( \ - loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \ - one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ - void *arg, TY num_iters, TY block_chunk, \ - uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::Distribute( \ - loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \ - } \ - [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \ - ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \ - thread_chunk, one_iteration_per_thread); \ - } - -extern "C" { -OMP_LOOP_ENTRY(_4, int32_t) -OMP_LOOP_ENTRY(_4u, uint32_t) -OMP_LOOP_ENTRY(_8, int64_t) -OMP_LOOP_ENTRY(_8u, uint64_t) -} |
