diff options
| author | Guillaume Chatelet <gchatelet@google.com> | 2025-10-14 09:02:30 -0700 |
|---|---|---|
| committer | Alex Richardson <alexrichardson@google.com> | 2025-10-14 09:02:30 -0700 |
| commit | e2d7be24a8dc31bb36380abd088b7eb0da7ef6b4 (patch) | |
| tree | 4811d025c12321c442695ad5aa4f511fa2fbd10b /offload | |
| parent | 1be5a8430be58baa5754e6f046eeacf7ca2f1a54 (diff) | |
| parent | 57726bdca274b152d2f36aaad7c961767bb1f91a (diff) | |
[𝘀𝗽𝗿] changes introduced through rebase
users/arichardson/spr/main.amdgpu-baseline-test-for-ptrtoaddr-code-generation
Created using spr 1.3.8-beta.1
[skip ci]
Diffstat (limited to 'offload')
216 files changed, 10026 insertions, 7086 deletions
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index 38fa77e41bb5..b27738078350 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -4,7 +4,8 @@ cmake_minimum_required(VERSION 3.20.0) set(LLVM_SUBPROJECT_TITLE "liboffload") -if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") +# Permit redefining OPENMP_STANDALONE_BUILD when doing a runtimes build. +if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(OPENMP_STANDALONE_BUILD TRUE) project(offload C CXX ASM) else() @@ -371,7 +372,6 @@ add_subdirectory(tools/offload-tblgen) # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins-nextgen) -add_subdirectory(DeviceRTL) add_subdirectory(tools) add_subdirectory(docs) diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt deleted file mode 100644 index e4916f4d4975..000000000000 --- a/offload/DeviceRTL/CMakeLists.txt +++ /dev/null @@ -1,188 +0,0 @@ -set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL - "Can be set to false to disable building this library.") - -if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB) - message(STATUS "Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB") - return() -endif() - -# Check to ensure the host system is a supported host architecture. -if(NOT ${CMAKE_SIZEOF_VOID_P} EQUAL "8") - message(STATUS "Not building DeviceRTL: Runtime does not support 32-bit hosts") - return() -endif() - -if (LLVM_DIR) - # Builds that use pre-installed LLVM have LLVM_DIR set. - # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route - find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) -elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD) - # LLVM in-tree builds may use CMake target names to discover the tools. 
- # A LLVM_ENABLE_PROJECTS=openmp build takes this route - set(CLANG_TOOL $<TARGET_FILE:clang>) -else() - message(STATUS "Not building DeviceRTL. No appropriate clang found") - return() -endif() - -set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) -set(include_directory ${devicertl_base_directory}/include) -set(source_directory ${devicertl_base_directory}/src) - -set(include_files - ${include_directory}/Allocator.h - ${include_directory}/Configuration.h - ${include_directory}/Debug.h - ${include_directory}/Interface.h - ${include_directory}/LibC.h - ${include_directory}/Mapping.h - ${include_directory}/Profiling.h - ${include_directory}/State.h - ${include_directory}/Synchronization.h - ${include_directory}/DeviceTypes.h - ${include_directory}/DeviceUtils.h - ${include_directory}/Workshare.h -) - -set(src_files - ${source_directory}/Allocator.cpp - ${source_directory}/Configuration.cpp - ${source_directory}/Debug.cpp - ${source_directory}/Kernel.cpp - ${source_directory}/LibC.cpp - ${source_directory}/Mapping.cpp - ${source_directory}/Misc.cpp - ${source_directory}/Parallelism.cpp - ${source_directory}/Profiling.cpp - ${source_directory}/Reduction.cpp - ${source_directory}/State.cpp - ${source_directory}/Synchronization.cpp - ${source_directory}/Tasking.cpp - ${source_directory}/DeviceUtils.cpp - ${source_directory}/Workshare.cpp -) - -# We disable the slp vectorizer during the runtime optimization to avoid -# vectorized accesses to the shared state. Generally, those are "good" but -# the optimizer pipeline (esp. Attributor) does not fully support vectorized -# instructions yet and we end up missing out on way more important constant -# propagation. That said, we will run the vectorizer again after the runtime -# has been linked into the user program. -set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false ) - -# If the user built with the GPU C library enabled we will use that instead. 
-if(${LIBOMPTARGET_GPU_LIBC_SUPPORT}) - list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC) -endif() - -# Set flags for LLVM Bitcode compilation. -set(bc_flags -c -flto -std=c++17 -fvisibility=hidden - ${clang_opt_flags} -nogpulib -nostdlibinc - -fno-rtti -fno-exceptions -fconvergent-functions - -Wno-unknown-cuda-version - -DOMPTARGET_DEVICE_RUNTIME - -I${include_directory} - -I${devicertl_base_directory}/../include - -I${devicertl_base_directory}/../../libc -) - -# first create an object target -function(compileDeviceRTLLibrary target_name target_triple) - set(target_bc_flags ${ARGN}) - - foreach(src ${src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - set(outfile "${outfile}-${target_name}.o") - set(depfile "${outfile}.d") - - # Passing an empty CPU to -march= suppressed target specific metadata. - add_custom_command(OUTPUT ${outfile} - COMMAND ${CLANG_TOOL} - ${bc_flags} - --target=${target_triple} - ${target_bc_flags} - -MD -MF ${depfile} - ${infile} -o ${outfile} - DEPENDS ${infile} - DEPFILE ${depfile} - COMMENT "Building LLVM bitcode ${outfile}" - VERBATIM - ) - if(TARGET clang) - # Add a file-level dependency to ensure that clang is up-to-date. - # By default, add_custom_command only builds clang if the - # executable is missing. - add_custom_command(OUTPUT ${outfile} - DEPENDS clang - APPEND - ) - endif() - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) - - list(APPEND obj_files ${CMAKE_CURRENT_BINARY_DIR}/${outfile}) - endforeach() - # Trick to combine these into a bitcode file via the linker's LTO pass. This - # is used to provide the legacy `libomptarget-<name>.bc` files. Hack this - # through as an executable to get it to use the relocatable link. 
- add_executable(libomptarget-${target_name} ${obj_files}) - set_target_properties(libomptarget-${target_name} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR} - LINKER_LANGUAGE CXX - BUILD_RPATH "" - INSTALL_RPATH "" - RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc) - target_compile_options(libomptarget-${target_name} PRIVATE - "--target=${target_triple}" "-fuse-ld=lld" "-march=" "-mcpu=" - "-Wno-unused-command-line-argument") - target_link_options(libomptarget-${target_name} PRIVATE "--target=${target_triple}" - "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" - "-fuse-ld=lld" "-march=" "-mcpu=") - install(TARGETS libomptarget-${target_name} - PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ - DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") - - add_library(omptarget.${target_name}.all_objs OBJECT IMPORTED) - set_property(TARGET omptarget.${target_name}.all_objs APPEND PROPERTY IMPORTED_OBJECTS - ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/libomptarget-${target_name}.bc) - add_dependencies(omptarget.${target_name}.all_objs libomptarget-${target_name}) - - # Archive all the object files generated above into a static library - add_library(omptarget.${target_name} STATIC) - set_target_properties(omptarget.${target_name} PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/${target_triple}" - ARCHIVE_OUTPUT_NAME ompdevice - LINKER_LANGUAGE CXX - ) - target_link_libraries(omptarget.${target_name} PRIVATE omptarget.${target_name}.all_objs) - target_link_options(omptarget.${target_name} PRIVATE "--target=${target_triple}" - "-Wno-unused-command-line-argument" "-r" "-nostdlib" "-flto" - "-Wl,--lto-emit-llvm" "-fuse-ld=lld" "-march=" "-mcpu=") - - install(TARGETS omptarget.${target_name} - ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") - - if (CMAKE_EXPORT_COMPILE_COMMANDS) - set(ide_target_name omptarget-ide-${target_name}) - add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files}) - 
target_compile_options(${ide_target_name} PRIVATE - -fvisibility=hidden --target=${target_triple} - -nogpulib -nostdlibinc -Wno-unknown-cuda-version - ) - target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512) - target_include_directories(${ide_target_name} PRIVATE - ${include_directory} - ${devicertl_base_directory}/../../libc - ${devicertl_base_directory}/../include - ) - install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL) - endif() -endfunction() - -if(NOT LLVM_TARGETS_TO_BUILD OR "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) - compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none) -endif() - -if(NOT LLVM_TARGETS_TO_BUILD OR "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) - compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63) -endif() diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h deleted file mode 100644 index dc4d029ed75f..000000000000 --- a/offload/DeviceRTL/include/Allocator.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_ALLOCATOR_H -#define OMPTARGET_ALLOCATOR_H - -#include "DeviceTypes.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace allocator { - -static uint64_t constexpr ALIGNMENT = 16; - -/// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); - -/// Allocate \p Size bytes. 
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * -alloc(uint64_t Size); - -/// Free the allocation pointed to by \p Ptr. -void free(void *Ptr); - -} // namespace allocator - -} // namespace ompx - -extern "C" { -void *malloc(size_t Size); -void free(void *Ptr); -} - -#endif diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h deleted file mode 100644 index 95408933dd86..000000000000 --- a/offload/DeviceRTL/include/Configuration.h +++ /dev/null @@ -1,68 +0,0 @@ -//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// API to query the global (constant) device environment. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_CONFIGURATION_H -#define OMPTARGET_CONFIGURATION_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -namespace ompx { -namespace config { - -/// Return the number of devices in the system, same number as returned on the -/// host by omp_get_num_devices. -uint32_t getNumDevices(); - -/// Return the device number in the system for omp_get_device_num. -uint32_t getDeviceNum(); - -/// Return the user chosen debug level. -uint32_t getDebugKind(); - -/// Return if teams oversubscription is assumed -uint32_t getAssumeTeamsOversubscription(); - -/// Return if threads oversubscription is assumed -uint32_t getAssumeThreadsOversubscription(); - -/// Return the amount of dynamic shared memory that was allocated at launch. -uint64_t getDynamicMemorySize(); - -/// Returns the cycles per second of the device's fixed frequency clock. 
-uint64_t getClockFrequency(); - -/// Returns the pointer to the beginning of the indirect call table. -void *getIndirectCallTablePtr(); - -/// Returns the size of the indirect call table. -uint64_t getIndirectCallTableSize(); - -/// Returns the size of the indirect call table. -uint64_t getHardwareParallelism(); - -/// Return if debugging is enabled for the given debug kind. -bool isDebugMode(DeviceDebugKind Level); - -/// Indicates if this kernel may require thread-specific states, or if it was -/// explicitly disabled by the user. -bool mayUseThreadStates(); - -/// Indicates if this kernel may require data environments for nested -/// parallelism, or if it was explicitly disabled by the user. -bool mayUseNestedParallelism(); - -} // namespace config -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h deleted file mode 100644 index 98d0fa498d95..000000000000 --- a/offload/DeviceRTL/include/Debug.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEBUG_H -#define OMPTARGET_DEVICERTL_DEBUG_H - -#include "Configuration.h" -#include "LibC.h" - -/// Assertion -/// -/// { -extern "C" { -void __assert_assume(bool condition); -void __assert_fail(const char *expr, const char *file, unsigned line, - const char *function); -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function); -} - -#define ASSERT(expr, msg) \ - { \ - if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr)) \ - __assert_fail_internal(#expr, msg, __FILE__, __LINE__, \ - __PRETTY_FUNCTION__); \ - else \ - __assert_assume(expr); \ - } -#define UNREACHABLE(msg) \ - printf(msg); \ - __builtin_trap(); \ - __builtin_unreachable(); - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h deleted file mode 100644 index 2e5d92380f04..000000000000 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ /dev/null @@ -1,166 +0,0 @@ -//===---------- DeviceTypes.h - OpenMP types ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_TYPES_H -#define OMPTARGET_TYPES_H - -#include <gpuintrin.h> -#include <stddef.h> -#include <stdint.h> - -template <typename T> using Private = __gpu_private T; -template <typename T> using Constant = __gpu_constant T; -template <typename T> using Local = __gpu_local T; -template <typename T> using Global = __gpu_local T; - -enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -}; - -enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -}; - -enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. 
- * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s) & kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -}; - -struct TaskDescriptorTy; -using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); -struct TaskDescriptorTy { - void *Payload; - TaskFnTy TaskFn; -}; - -using LaneMaskTy = uint64_t; - -namespace lanes { -enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; -} // namespace lanes - -/// The ident structure that describes a source location. The struct is -/// identical to the one in the kmp.h file. We maintain the same data structure -/// for compatibility. 
-struct IdentTy { - int32_t reserved_1; /**< might be used in Fortran; see above */ - int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - int32_t reserved_2; /**< not really used in Fortran any more; see above */ - int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. */ -}; - -using __kmpc_impl_lanemask_t = LaneMaskTy; - -using ParallelRegionFnTy = void *; - -using CriticalNameTy = int32_t[8]; - -struct omp_lock_t { - void *Lock; -}; - -using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); -using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, - int16_t lane_offset, int16_t shortCircuit); -using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); - -/// Macros for allocating variables in different address spaces. -///{ - -// Follows the pattern in interface.h -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) -} omp_allocator_handle_t; - -#define __PRAGMA(STR) _Pragma(#STR) -#define OMP_PRAGMA(STR) __PRAGMA(omp STR) - -///} - -#endif diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h deleted file mode 100644 index b92514ee9838..000000000000 --- a/offload/DeviceRTL/include/DeviceUtils.h +++ /dev/null @@ -1,96 +0,0 @@ -//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H -#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H - -#include "DeviceTypes.h" -#include "Shared/Utils.h" - -namespace utils { - -template <typename T> struct type_identity { - using type = T; -}; - -template <typename T, T v> struct integral_constant { - inline static constexpr T value = v; -}; - -/// Freestanding SFINAE helpers. -template <class T> struct remove_cv : type_identity<T> {}; -template <class T> struct remove_cv<const T> : type_identity<T> {}; -template <class T> struct remove_cv<volatile T> : type_identity<T> {}; -template <class T> struct remove_cv<const volatile T> : type_identity<T> {}; -template <class T> using remove_cv_t = typename remove_cv<T>::type; - -using true_type = integral_constant<bool, true>; -using false_type = integral_constant<bool, false>; - -template <typename T, typename U> struct is_same : false_type {}; -template <typename T> struct is_same<T, T> : true_type {}; -template <typename T, typename U> -inline constexpr bool is_same_v = is_same<T, U>::value; - -template <typename T> struct is_floating_point { - inline static constexpr bool value = - is_same_v<remove_cv_t<T>, float> || is_same_v<remove_cv_t<T>, double>; -}; -template <typename T> -inline constexpr bool is_floating_point_v = is_floating_point<T>::value; - -template <bool B, typename T = void> struct enable_if; -template <typename T> struct enable_if<true, T> : type_identity<T> {}; -template <bool B, typename T = void> -using enable_if_t = typename enable_if<B, T>::type; - -template <class T> struct remove_addrspace : type_identity<T> {}; -template <class T, int N> -struct remove_addrspace<T [[clang::address_space(N)]]> : type_identity<T> 
{}; -template <class T> -using remove_addrspace_t = typename remove_addrspace<T>::type; - -template <typename To, typename From> inline To bitCast(From V) { - static_assert(sizeof(To) == sizeof(From), "Bad conversion"); - return __builtin_bit_cast(To, V); -} - -/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread -/// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); - -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); - -/// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); - -/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); - -/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). -bool isThreadLocalMemPtr(void *Ptr); - -/// A pointer variable that has by design an `undef` value. Use with care. -[[clang::loader_uninitialized]] static void *const UndefPtr; - -#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) -#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) - -} // namespace utils - -#endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h deleted file mode 100644 index c4bfaaa2404b..000000000000 --- a/offload/DeviceRTL/include/Interface.h +++ /dev/null @@ -1,366 +0,0 @@ -//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_INTERFACE_H -#define OMPTARGET_DEVICERTL_INTERFACE_H - -#include "Shared/Environment.h" - -#include "DeviceTypes.h" - -/// External API -/// -///{ - -extern "C" { - -/// ICV: dyn-var, constant 0 -/// -/// setter: ignored. -/// getter: returns 0. -/// -///{ -void omp_set_dynamic(int); -int omp_get_dynamic(void); -///} - -/// ICV: nthreads-var, integer -/// -/// scope: data environment -/// -/// setter: ignored. -/// getter: returns false. -/// -/// implementation notes: -/// -/// -///{ -void omp_set_num_threads(int); -int omp_get_max_threads(void); -///} - -/// ICV: thread-limit-var, computed -/// -/// getter: returns thread limited defined during launch. -/// -///{ -int omp_get_thread_limit(void); -///} - -/// ICV: max-active-level-var, constant 1 -/// -/// setter: ignored. -/// getter: returns 1. -/// -///{ -void omp_set_max_active_levels(int); -int omp_get_max_active_levels(void); -///} - -/// ICV: places-partition-var -/// -/// -///{ -///} - -/// ICV: active-level-var, 0 or 1 -/// -/// getter: returns 0 or 1. -/// -///{ -int omp_get_active_level(void); -///} - -/// ICV: level-var -/// -/// getter: returns parallel region nesting -/// -///{ -int omp_get_level(void); -///} - -/// ICV: run-sched-var -/// -/// -///{ -void omp_set_schedule(omp_sched_t, int); -void omp_get_schedule(omp_sched_t *, int *); -///} - -/// TODO this is incomplete. 
-int omp_get_num_threads(void); -int omp_get_thread_num(void); -void omp_set_nested(int); - -int omp_get_nested(void); - -void omp_set_max_active_levels(int Level); - -int omp_get_max_active_levels(void); - -omp_proc_bind_t omp_get_proc_bind(void); - -int omp_get_num_places(void); - -int omp_get_place_num_procs(int place_num); - -void omp_get_place_proc_ids(int place_num, int *ids); - -int omp_get_place_num(void); - -int omp_get_partition_num_places(void); - -void omp_get_partition_place_nums(int *place_nums); - -int omp_get_cancellation(void); - -void omp_set_default_device(int deviceId); - -int omp_get_default_device(void); - -int omp_get_num_devices(void); - -int omp_get_device_num(void); - -int omp_get_num_teams(void); - -int omp_get_team_num(); - -int omp_get_initial_device(void); - -void *llvm_omp_target_dynamic_shared_alloc(); - -/// Synchronization -/// -///{ -void omp_init_lock(omp_lock_t *Lock); - -void omp_destroy_lock(omp_lock_t *Lock); - -void omp_set_lock(omp_lock_t *Lock); - -void omp_unset_lock(omp_lock_t *Lock); - -int omp_test_lock(omp_lock_t *Lock); -///} - -/// Tasking -/// -///{ -int omp_in_final(void); - -int omp_get_max_task_priority(void); -///} - -/// Misc -/// -///{ -double omp_get_wtick(void); - -double omp_get_wtime(void); -///} -} - -extern "C" { -/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be -/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be -/// called by any thread, allocation happens *per thread*. -void *__kmpc_alloc_shared(uint64_t Bytes); - -/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like -/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the -/// allocated by __kmpc_alloc_shared by the same thread. -void __kmpc_free_shared(void *Ptr, uint64_t Bytes); - -/// Get a pointer to the memory buffer containing dynamically allocated shared -/// memory configured at launch. 
-void *__kmpc_get_dynamic_shared(); - -/// Allocate sufficient space for \p NumArgs sequential `void*` and store the -/// allocation address in \p GlobalArgs. -/// -/// Called by the main thread prior to a parallel region. -/// -/// We also remember it in GlobalArgsPtr to ensure the worker threads and -/// deallocation function know the allocation address too. -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); - -/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. -/// -/// Called by the main thread after a parallel region. -void __kmpc_end_sharing_variables(); - -/// Store the allocation address obtained via __kmpc_begin_sharing_variables in -/// \p GlobalArgs. -/// -/// Called by the worker threads in the parallel region (function). -void __kmpc_get_shared_variables(void ***GlobalArgs); - -/// External interface to get the thread ID. -uint32_t __kmpc_get_hardware_thread_id_in_block(); - -/// External interface to get the number of threads. -uint32_t __kmpc_get_hardware_num_threads_in_block(); - -/// External interface to get the warp size. 
-uint32_t __kmpc_get_warp_size(); - -/// Kernel -/// -///{ -// Forward declaration -struct KernelEnvironmentTy; - -int8_t __kmpc_is_spmd_exec_mode(); - -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -void __kmpc_target_deinit(); - -///} - -/// Reduction -/// -///{ -void *__kmpc_reduction_get_fixed_buffer(); - -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct); - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct); -///} - -/// Synchronization -/// -///{ -void __kmpc_ordered(IdentTy *Loc, int32_t TId); - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); - -void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId); - -void __kmpc_end_master(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter); - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId); - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId); - -void __kmpc_end_single(IdentTy *Loc, int32_t TId); - -void __kmpc_flush(IdentTy *Loc); - -uint64_t __kmpc_warp_active_thread_mask(void); - -void __kmpc_syncwarp(uint64_t Mask); - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); -///} - -/// Parallelism -/// -///{ -/// TODO -void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy 
WorkFn); - -/// TODO -bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); - -/// TODO -void __kmpc_kernel_end_parallel(); - -/// TODO -void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); - -/// TODO -void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, - int32_t ThreadLimit); - -/// TODO -uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); - -///} - -/// Tasking -/// -///{ -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn); - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *); - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *); - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *); -///} - -/// Misc -/// -///{ -int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); - -int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); -///} - -/// Shuffle -/// -///{ -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -///} -} - -#endif diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h deleted file 
mode 100644 index 94b5e6519606..000000000000 --- a/offload/DeviceRTL/include/LibC.h +++ /dev/null @@ -1,23 +0,0 @@ -//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_LIBC_H -#define OMPTARGET_LIBC_H - -#include "DeviceTypes.h" - -namespace ompx { - -int printf(const char *Format, ...); - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h deleted file mode 100644 index 8ba018b5314a..000000000000 --- a/offload/DeviceRTL/include/Mapping.h +++ /dev/null @@ -1,108 +0,0 @@ -//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_MAPPING_H -#define OMPTARGET_MAPPING_H - -#include "DeviceTypes.h" - -namespace ompx { - -namespace mapping { - -enum { - DIM_X = __GPU_X_DIM, - DIM_Y = __GPU_Y_DIM, - DIM_Z = __GPU_Z_DIM, -}; - -inline constexpr uint32_t MaxThreadsPerTeam = 1024; - -/// Initialize the mapping machinery. -void init(bool IsSPMD); - -/// Return true if the kernel is executed in SPMD mode. -bool isSPMDMode(); - -/// Return true if the kernel is executed in generic mode. 
-bool isGenericMode(); - -/// Return true if the executing thread is the main thread in generic mode. -/// These functions will lookup state and it is required that that is OK for the -/// thread and location. See also `isInitialThreadInLevel0` for a stateless -/// alternative for certain situations, e.g. during initialization. -bool isMainThreadInGenericMode(); -bool isMainThreadInGenericMode(bool IsSPMD); - -/// Return true if this thread is the initial thread in parallel level 0. -/// -/// The thread for which this returns true should be used for single threaded -/// initialization tasks. We pick a special thread to ensure there are no -/// races between the initialization and the first read of initialized state. -bool isInitialThreadInLevel0(bool IsSPMD); - -/// Return true if the executing thread has the lowest Id of the active threads -/// in the warp. -bool isLeaderInWarp(); - -/// Return a mask describing all active threads in the warp. -LaneMaskTy activemask(); - -/// Return a mask describing all threads with a smaller Id in the warp. -LaneMaskTy lanemaskLT(); - -/// Return a mask describing all threads with a larger Id in the warp. -LaneMaskTy lanemaskGT(); - -/// Return the thread Id in the warp, in [0, getWarpSize()). -uint32_t getThreadIdInWarp(); - -/// Return the warp size, thus number of threads in the warp. -uint32_t getWarpSize(); - -/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()] -uint32_t getWarpIdInBlock(); - -/// Return the number of warps in the block. -uint32_t getNumberOfWarpsInBlock(); - -/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)). -uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); - -/// Return the block size, thus number of threads in the block. -uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); - -/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)). 
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); - -/// Return the number of blocks in the kernel. -uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); - -/// Return the kernel size, thus number of threads in the kernel. -uint32_t getNumberOfThreadsInKernel(); - -/// Return the maximal number of threads in the block usable for a team (= -/// parallel region). -/// -/// Note: The version taking \p IsSPMD mode explicitly can be used during the -/// initialization of the target region, that is before `mapping::isSPMDMode()` -/// can be called by any thread other than the main one. -uint32_t getMaxTeamThreads(); -uint32_t getMaxTeamThreads(bool IsSPMD); - -/// Return the number of processing elements on the device. -uint32_t getNumberOfProcessorElements(); - -} // namespace mapping - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h deleted file mode 100644 index d99475225412..000000000000 --- a/offload/DeviceRTL/include/Profiling.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_PROFILING_H -#define OMPTARGET_DEVICERTL_PROFILING_H - -extern "C" { -void __llvm_profile_register_function(void *Ptr); -void __llvm_profile_register_names_function(void *Ptr, long int I); -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); -} - -#endif diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h deleted file mode 100644 index db396dae6e44..000000000000 --- a/offload/DeviceRTL/include/State.h +++ /dev/null @@ -1,377 +0,0 @@ -//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_STATE_H -#define OMPTARGET_STATE_H - -#include "Shared/Environment.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" - -// Forward declaration. -struct KernelEnvironmentTy; - -namespace ompx { - -namespace memory { - -/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. -void *allocShared(uint64_t Size, const char *Reason); - -/// Free \p Ptr, allocated via allocShared, for \p Reason. -/// -/// Note: See the restrictions on __kmpc_free_shared for proper usage. -void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); - -/// Alloca \p Size bytes in global memory, if possible, for \p Reason. 
-void *allocGlobal(uint64_t Size, const char *Reason); - -/// Return a pointer to the dynamic shared memory buffer. -void *getDynamicBuffer(); - -/// Free \p Ptr, allocated via allocGlobal, for \p Reason. -void freeGlobal(void *Ptr, const char *Reason); - -} // namespace memory - -namespace state { - -inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; - -struct ICVStateTy { - uint32_t NThreadsVar; - uint32_t LevelVar; - uint32_t ActiveLevelVar; - uint32_t Padding0Val; - uint32_t MaxActiveLevelsVar; - uint32_t RunSchedVar; - uint32_t RunSchedChunkVar; - - bool operator==(const ICVStateTy &Other) const; - - void assertEqual(const ICVStateTy &Other) const; -}; - -struct TeamStateTy { - void init(bool IsSPMD); - - bool operator==(const TeamStateTy &) const; - - void assertEqual(TeamStateTy &Other) const; - - /// ICVs - /// - /// Preallocated storage for ICV values that are used if the threads have not - /// set a custom default. The latter is supported but unlikely and slow(er). - /// - ///{ - ICVStateTy ICVState; - ///} - - uint32_t ParallelTeamSize; - uint32_t HasThreadState; - ParallelRegionFnTy ParallelRegionFnVar; -}; - -extern Local<TeamStateTy> TeamState; - -struct ThreadStateTy { - - /// ICVs have preallocated storage in the TeamStateTy which is used if a - /// thread has not set a custom value. The latter is supported but unlikely. - /// When it happens we will allocate dynamic memory to hold the values of all - /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an - /// ICV struct to hold them all. This is slower than alternatives but allows - /// users to pay only for what they use. - /// - state::ICVStateTy ICVState; - - ThreadStateTy *PreviousThreadState; - - void init() { - ICVState = TeamState.ICVState; - PreviousThreadState = nullptr; - } - - void init(ThreadStateTy *PreviousTS) { - ICVState = PreviousTS ? 
PreviousTS->ICVState : TeamState.ICVState; - PreviousThreadState = PreviousTS; - } -}; - -extern Local<ThreadStateTy **> ThreadStates; - -/// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); - -/// Return the kernel and kernel launch environment associated with the current -/// kernel. The former is static and contains compile time information that -/// holds for all instances of the kernel. The latter is dynamic and provides -/// per-launch information. -KernelEnvironmentTy &getKernelEnvironment(); -KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); - -/// TODO -enum ValueKind { - VK_NThreads, - VK_Level, - VK_ActiveLevel, - VK_MaxActiveLevels, - VK_RunSched, - // --- - VK_RunSchedChunk, - VK_ParallelRegionFn, - VK_ParallelTeamSize, - VK_HasThreadState, -}; - -/// TODO -void enterDataEnvironment(IdentTy *Ident); - -/// TODO -void exitDataEnvironment(); - -/// TODO -struct DateEnvironmentRAII { - DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } - ~DateEnvironmentRAII() { exitDataEnvironment(); } -}; - -/// TODO -void resetStateForThread(uint32_t TId); - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. 
-#define lookupForModify32Impl(Member, Ident, ForceTeamState) \ - { \ - if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() || \ - !TeamState.HasThreadState)) \ - return TeamState.ICVState.Member; \ - uint32_t TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ThreadStates[TId])) { \ - ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>( \ - memory::allocGlobal(sizeof(ThreadStateTy), \ - "ICV modification outside data environment")); \ - ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!"); \ - TeamState.HasThreadState = true; \ - ThreadStates[TId]->init(); \ - } \ - return ThreadStates[TId]->ICVState.Member; \ - } - -// FIXME: https://github.com/llvm/llvm-project/issues/123241. -#define lookupImpl(Member, ForceTeamState) \ - { \ - auto TId = mapping::getThreadIdInBlock(); \ - if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && \ - TeamState.HasThreadState && ThreadStates[TId])) \ - return ThreadStates[TId]->ICVState.Member; \ - return TeamState.ICVState.Member; \ - } - -[[gnu::always_inline, gnu::flatten]] inline uint32_t & -lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - switch (Kind) { - case state::VK_NThreads: - if (IsReadonly) - lookupImpl(NThreadsVar, ForceTeamState); - lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState); - case state::VK_Level: - if (IsReadonly) - lookupImpl(LevelVar, ForceTeamState); - lookupForModify32Impl(LevelVar, Ident, ForceTeamState); - case state::VK_ActiveLevel: - if (IsReadonly) - lookupImpl(ActiveLevelVar, ForceTeamState); - lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState); - case state::VK_MaxActiveLevels: - if (IsReadonly) - lookupImpl(MaxActiveLevelsVar, ForceTeamState); - lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState); - case state::VK_RunSched: - if (IsReadonly) - lookupImpl(RunSchedVar, ForceTeamState); - lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState); - case state::VK_RunSchedChunk: - if 
(IsReadonly) - lookupImpl(RunSchedChunkVar, ForceTeamState); - lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState); - case state::VK_ParallelTeamSize: - return TeamState.ParallelTeamSize; - case state::VK_HasThreadState: - return TeamState.HasThreadState; - default: - break; - } - __builtin_unreachable(); -} - -[[gnu::always_inline, gnu::flatten]] inline void *& -lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { - switch (Kind) { - case state::VK_ParallelRegionFn: - return TeamState.ParallelRegionFnVar; - default: - break; - } - __builtin_unreachable(); -} - -/// A class without actual state used to provide a nice interface to lookup and -/// update ICV values we can declare in global scope. -template <typename Ty, ValueKind Kind> struct Value { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) { - set(Other, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator++() { - inc(1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] Value &operator--() { - inc(-1, /*IdentTy=*/nullptr); - return *this; - } - - [[gnu::flatten, gnu::always_inline]] void - assert_eq(const Ty &V, IdentTy *Ident = nullptr, - bool ForceTeamState = false) { - ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr); - } - -private: - [[gnu::flatten, gnu::always_inline]] Ty & - lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { - Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState); - return t; - } - - [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) += - UpdateVal); - } - - [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) { - return (lookup(/*IsReadonly=*/false, Ident, 
/*ForceTeamState=*/false) = - UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -/// A mookup class without actual state used to provide -/// a nice interface to lookup and update ICV values -/// we can declare in global scope. -template <typename Ty, ValueKind Kind> struct PtrValue { - [[gnu::flatten, gnu::always_inline]] operator Ty() { - return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false); - } - - [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) { - set(Other); - return *this; - } - -private: - Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) { - return lookupPtr(Kind, IsReadonly, ForceTeamState); - } - - Ty &set(Ty UpdateVal) { - return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr, - /*ForceTeamState=*/false) = UpdateVal); - } - - template <typename VTy, typename Ty2> friend struct ValueRAII; -}; - -template <typename VTy, typename Ty> struct ValueRAII { - ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident, - bool ForceTeamState = false) - : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState) - : (Ty *)utils::UndefPtr), - Val(OldValue), Active(Active) { - if (!Active) - return; - ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!"); - *Ptr = NewValue; - } - ~ValueRAII() { - if (Active) - *Ptr = Val; - } - -private: - Ty *Ptr; - Ty Val; - bool Active; -}; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk; - -/// TODO -inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize; - -/// TODO -inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState; - -/// TODO -inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn> - ParallelRegionFn; - -void runAndCheckState(void(Func(void))); - -void assumeInitialState(bool IsSPMD); - -/// Return the value of the ParallelTeamSize ICV. 
-int getEffectivePTeamSize(); - -} // namespace state - -namespace icv { - -/// TODO -inline state::Value<uint32_t, state::VK_NThreads> NThreads; - -/// TODO -inline state::Value<uint32_t, state::VK_Level> Level; - -/// The `active-level` describes which of the parallel level counted with the -/// `level-var` is active. There can only be one. -/// -/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0. -inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel; - -/// TODO -inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels; - -/// TODO -inline state::Value<uint32_t, state::VK_RunSched> RunSched; - -} // namespace icv - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h deleted file mode 100644 index 7e7c8eacb917..000000000000 --- a/offload/DeviceRTL/include/Synchronization.h +++ /dev/null @@ -1,225 +0,0 @@ -//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H -#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H - -#include "DeviceTypes.h" -#include "DeviceUtils.h" - -namespace ompx { -namespace atomic { - -enum OrderingTy { - relaxed = __ATOMIC_RELAXED, - acquire = __ATOMIC_ACQUIRE, - release = __ATOMIC_RELEASE, - acq_rel = __ATOMIC_ACQ_REL, - seq_cst = __ATOMIC_SEQ_CST, -}; - -enum MemScopeTy { - system = __MEMORY_SCOPE_SYSTEM, - device = __MEMORY_SCOPE_DEVICE, - workgroup = __MEMORY_SCOPE_WRKGRP, - wavefront = __MEMORY_SCOPE_WVFRNT, - single = __MEMORY_SCOPE_SINGLE, -}; - -/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. -uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device); - -/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. 
The -/// result is stored in \p *Addr; -/// { - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, - atomic::OrderingTy OrderingFail, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, - OrderingSucc, OrderingFail, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V add(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V load(Ty *Address, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { -#ifdef __NVPTX__ - return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope); -#else - return __scoped_atomic_load_n(Address, Ordering, MemScope); -#endif -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -void store(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - __scoped_atomic_store_n(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V mul(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - Ty TypedCurrentVal, TypedResultVal, TypedNewVal; - bool Success; - do { - TypedCurrentVal = atomic::load(Address, Ordering); - TypedNewVal = TypedCurrentVal * Val; - Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, - atomic::relaxed, MemScope); - } while (!Success); - return TypedResultVal; -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_max(Address, Val, Ordering, 
MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(max( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(min( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -max(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(max( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(min( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<!utils::is_floating_point_v<V>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, float>, V> -min(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<float>(min( - (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); - return utils::bitCast<float>(max( - (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); -} - -// TODO: Implement this with __atomic_fetch_max and remove the duplication. 
-template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -utils::enable_if_t<utils::is_same_v<V, double>, V> -min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - if (Val >= 0) - return utils::bitCast<double>(min( - (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); - return utils::bitCast<double>(max( - (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope); -} - -template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> -V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope); -} - -static inline uint32_t -atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::device) { - uint32_t R; - __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope); - return R; -} - -///} - -} // namespace atomic - -namespace synchronize { - -/// Initialize the synchronization machinery. Must be called by all threads. -void init(bool IsSPMD); - -/// Synchronize all threads in a warp identified by \p Mask. -void warp(LaneMaskTy Mask); - -/// Synchronize all threads in a block and perform a fence before and after the -/// barrier according to \p Ordering. Note that the fence might be part of the -/// barrier. 
-void threads(atomic::OrderingTy Ordering); - -/// Synchronizing threads is allowed even if they all hit different instances of -/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more -/// restrictive in that it requires all threads to hit the same instance. The -/// noinline is removed by the openmp-opt pass and helps to preserve the -/// information till then. -///{ - -/// Synchronize all threads in a block, they are reaching the same instruction -/// (hence all threads in the block are "aligned"). Also perform a fence before -/// and after the barrier according to \p Ordering. Note that the -/// fence might be part of the barrier if the target offers this. -[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void -threadsAligned(atomic::OrderingTy Ordering); - -///} - -} // namespace synchronize - -namespace fence { - -/// Memory fence with \p Ordering semantics for the team. -void team(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the contention group. -void kernel(atomic::OrderingTy Ordering); - -/// Memory fence with \p Ordering semantics for the system. -void system(atomic::OrderingTy Ordering); - -} // namespace fence - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h deleted file mode 100644 index 554c3271c334..000000000000 --- a/offload/DeviceRTL/include/Workshare.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-------- Workshare.h - OpenMP Workshare interface ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_WORKSHARE_H -#define OMPTARGET_WORKSHARE_H - -namespace ompx { - -namespace workshare { - -/// Initialize the worksharing machinery. -void init(bool IsSPMD); - -} // namespace workshare - -} // namespace ompx - -#endif diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/offload/DeviceRTL/include/generated_microtask_cases.gen deleted file mode 100644 index a05f6da2f84f..000000000000 --- a/offload/DeviceRTL/include/generated_microtask_cases.gen +++ /dev/null @@ -1,797 +0,0 @@ -case 0: -((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid); -break; -case 1: -((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]); -break; -case 2: -((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid, - args[0], args[1]); -break; -case 3: -((void (*)(int32_t *, int32_t *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]); -break; -case 4: -((void (*)(int32_t *, int32_t *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3]); -break; -case 5: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4]); -break; -case 6: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5]); -break; -case 7: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6]); -break; -case 8: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, 
void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7]); -break; -case 9: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8]); -break; -case 10: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], - args[4], args[5], args[6], - args[7], args[8], args[9]); -break; -case 11: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10]); -break; -case 12: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11]); -break; -case 13: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12]); -break; -case 14: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13]); -break; -case 15: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, 
- void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14]); -break; -case 16: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15]); -break; -case 17: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16]); -break; -case 18: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17]); -break; -case 19: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18]); -break; -case 20: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void 
*))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19]); -break; -case 21: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20]); -break; -case 22: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21]); -break; -case 23: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22]); -break; -case 24: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], 
args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23]); -break; -case 25: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)(&global_tid, &bound_tid, args[0], - args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], - args[21], args[22], args[23], args[24]); -break; -case 26: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25]); -break; -case 27: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26]); -break; -case 28: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, 
void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27]); -break; -case 29: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28]); -break; -case 30: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29]); -break; -case 31: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, 
void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30]); -break; -case 32: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], - args[31]); -break; -case 33: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32]); -break; -case 34: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, 
void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33]); -break; -case 35: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34]); -break; -case 36: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35]); -break; -case 37: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void 
*, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36]); -break; -case 38: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37]); -break; -case 39: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], 
args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38]); -break; -case 40: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39]); -break; -case 41: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40]); -break; -case 42: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, 
void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41]); -break; -case 43: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42]); -break; -case 44: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, 
void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43]); -break; -case 45: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44]); -break; -case 46: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], 
args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45]); -break; -/// DONE TO HERE -case 47: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46]); -break; -case 48: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], 
args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47]); -break; -case 49: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48]); -break; -case 50: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], 
args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49]); -break; -case 51: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50]); -break; -case 52: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], 
args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51]); -break; -case 53: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52]); -break; -case 54: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], 
args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53]); -break; -case 55: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54]); -break; -case 56: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void 
*, void *, void *, void *, void *, void *, - void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1], - args[2], args[3], args[4], args[5], args[6], - args[7], args[8], args[9], args[10], args[11], - args[12], args[13], args[14], args[15], args[16], - args[17], args[18], args[19], args[20], args[21], - args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], - args[32], args[33], args[34], args[35], args[36], - args[37], args[38], args[39], args[40], args[41], - args[42], args[43], args[44], args[45], args[46], - args[47], args[48], args[49], args[50], args[51], - args[52], args[53], args[54], args[55]); -break; -case 57: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56]); -break; -case 58: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void 
*, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57]); -break; -case 59: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - 
args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58]); -break; -case 60: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59]); -break; -case 61: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], 
args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60]); -break; -case 62: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61]); -break; -case 63: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, 
void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2], - args[3], args[4], args[5], args[6], args[7], args[8], - args[9], args[10], args[11], args[12], args[13], - args[14], args[15], args[16], args[17], args[18], - args[19], args[20], args[21], args[22], args[23], - args[24], args[25], args[26], args[27], args[28], - args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], - args[39], args[40], args[41], args[42], args[43], - args[44], args[45], args[46], args[47], args[48], - args[49], args[50], args[51], args[52], args[53], - args[54], args[55], args[56], args[57], args[58], - args[59], args[60], args[61], args[62]); -break; -case 64: -((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *, void *, void *, void *, void *, void *, void *, - void *, void *))fn)( - &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], - args[13], args[14], args[15], args[16], args[17], args[18], args[19], - args[20], args[21], args[22], args[23], args[24], args[25], args[26], - args[27], args[28], args[29], args[30], args[31], args[32], args[33], - args[34], args[35], args[36], args[37], args[38], args[39], args[40], - 
args[41], args[42], args[43], args[44], args[45], args[46], args[47], - args[48], args[49], args[50], args[51], args[52], args[53], args[54], - args[55], args[56], args[57], args[58], args[59], args[60], args[61], - args[62], args[63]); -break; diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp deleted file mode 100644 index aac2a6005158..000000000000 --- a/offload/DeviceRTL/src/Allocator.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Mapping.h" -#include "Synchronization.h" - -using namespace ompx; - -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; - -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. 
-struct BumpAllocatorTy final { - - void *alloc(uint64_t Size) { - Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) - __builtin_trap(); - - return reinterpret_cast<void *>(OldData); - } - - void free(void *) {} -}; - -BumpAllocatorTy BumpAllocator; - -/// allocator namespace implementation -/// -///{ - -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. -} - -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } - -///} diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp deleted file mode 100644 index 0c31c66ab2de..000000000000 --- a/offload/DeviceRTL/src/Configuration.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data object of the constant device environment and the -// query API. -// -//===----------------------------------------------------------------------===// - -#include "Configuration.h" -#include "DeviceTypes.h" -#include "State.h" - -using namespace ompx; - -// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled. -[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription = - 0; -[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0; - -// This variable should be visible to the plugin so we override the default -// hidden visibility. -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment; - -uint32_t config::getAssumeTeamsOversubscription() { - return __omp_rtl_assume_teams_oversubscription; -} - -uint32_t config::getAssumeThreadsOversubscription() { - return __omp_rtl_assume_threads_oversubscription; -} - -uint32_t config::getDebugKind() { - return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind; -} - -uint32_t config::getNumDevices() { - return __omp_rtl_device_environment.NumDevices; -} - -uint32_t config::getDeviceNum() { - return __omp_rtl_device_environment.DeviceNum; -} - -uint64_t config::getDynamicMemorySize() { - return __omp_rtl_device_environment.DynamicMemSize; -} - -uint64_t config::getClockFrequency() { - return __omp_rtl_device_environment.ClockFrequency; -} - -void *config::getIndirectCallTablePtr() { - return reinterpret_cast<void *>( - __omp_rtl_device_environment.IndirectCallTable); -} - -uint64_t 
config::getHardwareParallelism() { - return __omp_rtl_device_environment.HardwareParallelism; -} - -uint64_t config::getIndirectCallTableSize() { - return __omp_rtl_device_environment.IndirectCallTableSize; -} - -bool config::isDebugMode(DeviceDebugKind Kind) { - return config::getDebugKind() & uint32_t(Kind); -} - -bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } - -bool config::mayUseNestedParallelism() { - if (__omp_rtl_assume_no_nested_parallelism) - return false; - return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; -} diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp deleted file mode 100644 index 5b5482d766b1..000000000000 --- a/offload/DeviceRTL/src/Debug.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug utilities -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -extern "C" { -void __assert_assume(bool condition) { __builtin_assume(condition); } - -#ifndef OMPTARGET_HAS_LIBC -[[gnu::weak]] void __assert_fail(const char *expr, const char *file, - unsigned line, const char *function) { - __assert_fail_internal(expr, nullptr, file, line, function); -} -#endif - -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function) { - if (msg) { - printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, - msg, expr); - } else { - printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr); - } - __builtin_trap(); -} -} diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp deleted file mode 100644 index d6f8c499c890..000000000000 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "DeviceUtils.h" - -#include "Debug.h" -#include "Interface.h" -#include "Mapping.h" -#include "gpuintrin.h" - -using namespace ompx; - -uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { - return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; -} - -void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { - static_assert(sizeof(unsigned long) == 8, ""); - LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL); - HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32); -} - -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, - int32_t Width) { - return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width); -} - -int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, - int32_t Width) { - int32_t Self = mapping::getThreadIdInWarp(); - int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? 
Self : Self + Delta; - return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); -} - -uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { - return __gpu_ballot(Mask, Pred); -} - -bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); } - -extern "C" { -int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { - return utils::shuffleDown(lanes::All, Val, Delta, SrcLane); -} - -int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { - return utils::shuffleDown(lanes::All, Val, Delta, Width); -} -} diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp deleted file mode 100644 index 467e44a65276..000000000000 --- a/offload/DeviceRTL/src/Kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the kernel entry points for the device. -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" -#include "Workshare.h" - -using namespace ompx; - -// These flags are copied from "llvm/Frontend/OpenMP/OMPDeviceConstants.h" and -// must be kept in-sync. 
-enum OMPTgtExecModeFlags : unsigned char { - OMP_TGT_EXEC_MODE_BARE = 0, - OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, - OMP_TGT_EXEC_MODE_SPMD = 1 << 1, - OMP_TGT_EXEC_MODE_GENERIC_SPMD = - OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD -}; - -static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - // Order is important here. - synchronize::init(IsSPMD); - mapping::init(IsSPMD); - state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); - workshare::init(IsSPMD); -} - -/// Simple generic state machine for worker threads. -static void genericStateMachine(IdentTy *Ident) { - uint32_t TId = mapping::getThreadIdInBlock(); - - do { - ParallelRegionFnTy WorkFn = nullptr; - - // Wait for the signal that we have a new work function. - synchronize::threads(atomic::seq_cst); - - // Retrieve the work function from the runtime. - bool IsActive = __kmpc_kernel_parallel(&WorkFn); - - // If there is nothing more to do, break out of the state machine by - // returning to the caller. - if (!WorkFn) - return; - - if (IsActive) { - ASSERT(!mapping::isSPMDMode(), nullptr); - ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); - __kmpc_kernel_end_parallel(); - } - - synchronize::threads(atomic::seq_cst); - - } while (true); -} - -extern "C" { - -/// Initialization -/// -/// \param Ident Source location identification, can be NULL. 
-/// -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; - bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; - bool UseGenericStateMachine = Configuration.UseGenericStateMachine; - if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); - synchronize::threadsAligned(atomic::relaxed); - } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); - // No need to wait since only the main threads will execute user - // code and workers will run into a barrier right away. - } - - if (IsSPMD) { - state::assumeInitialState(IsSPMD); - - // Synchronize to ensure the assertions above are in an aligned region. - // The barrier is eliminated later. - synchronize::threadsAligned(atomic::relaxed); - return -1; - } - - if (mapping::isInitialThreadInLevel0(IsSPMD)) - return -1; - - // Enter the generic state machine if enabled and if this thread can possibly - // be an active worker thread. - // - // The latter check is important for NVIDIA Pascal (but not Volta) and AMD - // GPU. In those cases, a single thread can apparently satisfy a barrier on - // behalf of all threads in the same warp. Thus, it would not be safe for - // other threads in the main thread's warp to reach the first - // synchronize::threads call in genericStateMachine before the main thread - // reaches its corresponding synchronize::threads call: that would permit all - // active worker threads to proceed before the main thread has actually set - // state::ParallelRegionFn, and then they would immediately quit without - // doing any work. mapping::getMaxTeamThreads() does not include any of the - // main thread's warp, so none of its threads can ever be active worker - // threads. 
- if (UseGenericStateMachine && - mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) - genericStateMachine(KernelEnvironment.Ident); - - return mapping::getThreadIdInBlock(); -} - -/// De-Initialization -/// -/// In non-SPMD, this function releases the workers trapped in a state machine -/// and also any memory dynamically allocated by the runtime. -/// -/// \param Ident Source location identification, can be NULL. -/// -void __kmpc_target_deinit() { - bool IsSPMD = mapping::isSPMDMode(); - if (IsSPMD) - return; - - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - // Signal the workers to exit the state machine and exit the kernel. - state::ParallelRegionFn = nullptr; - } else if (!state::getKernelEnvironment() - .Configuration.UseGenericStateMachine) { - // Retrieve the work function just to ensure we always call - // __kmpc_kernel_parallel even if a custom state machine is used. - // TODO: this is not super pretty. The problem is we create the call to - // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it - // is not there yet. Thus, we assume we never reach it from - // __kmpc_target_deinit. That allows us to remove the store in there to - // ParallelRegionFn, which leads to bad results later on. - ParallelRegionFnTy WorkFn = nullptr; - __kmpc_kernel_parallel(&WorkFn); - ASSERT(WorkFn == nullptr, nullptr); - } -} - -int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } -} diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp deleted file mode 100644 index 83f9233d9480..000000000000 --- a/offload/DeviceRTL/src/LibC.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "LibC.h" - -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) -extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; } -#else -extern "C" int vprintf(const char *format, __builtin_va_list); -#endif - -extern "C" { -[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) { - auto *L = reinterpret_cast<const unsigned char *>(lhs); - auto *R = reinterpret_cast<const unsigned char *>(rhs); - - for (size_t I = 0; I < count; ++I) - if (L[I] != R[I]) - return (int)L[I] - (int)R[I]; - - return 0; -} - -[[gnu::weak]] void memset(void *dst, int C, size_t count) { - auto *dstc = reinterpret_cast<char *>(dst); - for (size_t I = 0; I < count; ++I) - dstc[I] = C; -} - -[[gnu::weak]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} - -namespace ompx { -[[clang::no_builtin("printf")]] int printf(const char *Format, ...) { - __builtin_va_list vlist; - __builtin_va_start(vlist, Format); - return ::vprintf(Format, vlist); -} -} // namespace ompx diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp deleted file mode 100644 index b145892d1ece..000000000000 --- a/offload/DeviceRTL/src/Mapping.cpp +++ /dev/null @@ -1,212 +0,0 @@ -//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Mapping.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" -#include "gpuintrin.h" - -using namespace ompx; - -// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI -// is set to 'none'. We only support COV5+ but this can be removed when COV4 is -// fully deprecated. -#ifdef __AMDGPU__ -extern const inline uint32_t __oclc_ABI_version = 500; -[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__; -#endif - -static bool isInLastWarp() { - uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & - ~(mapping::getWarpSize() - 1); - return mapping::getThreadIdInBlock() == MainTId; -} - -bool mapping::isMainThreadInGenericMode(bool IsSPMD) { - if (IsSPMD || icv::Level) - return false; - - // Check if this is the last warp in the block. 
- return isInLastWarp(); -} - -bool mapping::isMainThreadInGenericMode() { - return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); -} - -bool mapping::isInitialThreadInLevel0(bool IsSPMD) { - if (IsSPMD) - return mapping::getThreadIdInBlock() == 0; - return isInLastWarp(); -} - -bool mapping::isLeaderInWarp() { - __kmpc_impl_lanemask_t Active = mapping::activemask(); - __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); - return utils::popc(Active & LaneMaskLT) == 0; -} - -LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } - -LaneMaskTy mapping::lanemaskLT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_lt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - int64_t Ballot = mapping::activemask(); - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; - return Mask & Ballot; -#endif -} - -LaneMaskTy mapping::lanemaskGT() { -#ifdef __NVPTX__ - return __nvvm_read_ptx_sreg_lanemask_gt(); -#else - uint32_t Lane = mapping::getThreadIdInWarp(); - if (Lane == (mapping::getWarpSize() - 1)) - return 0; - int64_t Ballot = mapping::activemask(); - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); - return Mask & Ballot; -#endif -} - -uint32_t mapping::getThreadIdInWarp() { - uint32_t ThreadIdInWarp = __gpu_lane_id(); - ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); - return ThreadIdInWarp; -} - -uint32_t mapping::getThreadIdInBlock(int32_t Dim) { - uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); - return ThreadIdInBlock; -} - -uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } - -uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { - uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); - // If we are in SPMD mode, remove one warp. 
- return BlockSize - (!IsSPMD * mapping::getWarpSize()); -} -uint32_t mapping::getMaxTeamThreads() { - return mapping::getMaxTeamThreads(mapping::isSPMDMode()); -} - -uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { - return __gpu_num_threads(Dim); -} - -uint32_t mapping::getNumberOfThreadsInKernel() { - return mapping::getNumberOfThreadsInBlock(0) * - mapping::getNumberOfBlocksInKernel(0) * - mapping::getNumberOfThreadsInBlock(1) * - mapping::getNumberOfBlocksInKernel(1) * - mapping::getNumberOfThreadsInBlock(2) * - mapping::getNumberOfBlocksInKernel(2); -} - -uint32_t mapping::getWarpIdInBlock() { - uint32_t WarpID = - mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); - ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); - return WarpID; -} - -uint32_t mapping::getBlockIdInKernel(int32_t Dim) { - uint32_t BlockId = __gpu_block_id(Dim); - ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); - return BlockId; -} - -uint32_t mapping::getNumberOfWarpsInBlock() { - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} - -uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { - return __gpu_num_blocks(Dim); -} - -uint32_t mapping::getNumberOfProcessorElements() { - return static_cast<uint32_t>(config::getHardwareParallelism()); -} - -///} - -/// Execution mode -/// -///{ - -// TODO: This is a workaround for initialization coming from kernels outside of -// the TU. We will need to solve this more correctly in the future. 
-[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode; - -void mapping::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - IsSPMDMode = IsSPMD; -} - -bool mapping::isSPMDMode() { return IsSPMDMode; } - -bool mapping::isGenericMode() { return !isSPMDMode(); } -///} - -extern "C" { -[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { - return mapping::getThreadIdInBlock(); -} - -[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { - return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); -} - -[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { - return mapping::getWarpSize(); -} -} - -#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \ - extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); } - -_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock) -_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel) -_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) -_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) - -extern "C" { -uint64_t ompx_ballot_sync(uint64_t mask, int pred) { - return utils::ballotSync(mask, pred); -} - -int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { - return utils::shuffleDown(mask, var, delta, width); -} - -float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, - int width) { - return utils::bitCast<float>( - utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width)); -} - -long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { - return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width); -} - -double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, - int width) { - return utils::bitCast<double>( - utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width)); -} -} diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp deleted file mode 100644 index a89f8b2a7453..000000000000 --- 
a/offload/DeviceRTL/src/Misc.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "Allocator.h" -#include "Configuration.h" -#include "DeviceTypes.h" -#include "Shared/RPCOpcodes.h" -#include "shared/rpc.h" - -#include "Debug.h" - -namespace ompx { -namespace impl { - -/// Lookup a device-side function using a host pointer /p HstPtr using the table -/// provided by the device plugin. The table is an ordered pair of host and -/// device pointers sorted on the value of the host pointer. -void *indirectCallLookup(void *HstPtr) { - if (!HstPtr) - return nullptr; - - struct IndirectCallTable { - void *HstPtr; - void *DevPtr; - }; - IndirectCallTable *Table = - reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr()); - uint64_t TableSize = config::getIndirectCallTableSize(); - - // If the table is empty we assume this is device pointer. - if (!Table || !TableSize) - return HstPtr; - - uint32_t Left = 0; - uint32_t Right = TableSize; - - // If the pointer is definitely not contained in the table we exit early. - if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr) - return HstPtr; - - while (Left != Right) { - uint32_t Current = Left + (Right - Left) / 2; - if (Table[Current].HstPtr == HstPtr) - return Table[Current].DevPtr; - - if (HstPtr < Table[Current].HstPtr) - Right = Current; - else - Left = Current; - } - - // If we searched the whole table and found nothing this is a device pointer. - return HstPtr; -} - -/// The openmp client instance used to communicate with the server. 
-[[gnu::visibility("protected"), - gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); - -} // namespace impl -} // namespace ompx - -/// Interfaces -/// -///{ - -extern "C" { -int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } - -int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } - -double omp_get_wtick(void) { - // The number of ticks per second for the AMDGPU clock varies by card and can - // only be retrieved by querying the driver. We rely on the device environment - // to inform us what the proper frequency is. NVPTX uses a nanosecond - // resolution, we could omit the global read but this makes it consistent. - return 1.0 / ompx::config::getClockFrequency(); -} - -double omp_get_wtime(void) { - return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick(); -} - -void *__llvm_omp_indirect_call_lookup(void *HstPtr) { - return ompx::impl::indirectCallLookup(HstPtr); -} - -void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - return malloc(size); - default: - return nullptr; - } -} - -void omp_free(void *ptr, omp_allocator_handle_t allocator) { - switch (allocator) { - case omp_default_mem_alloc: - case omp_large_cap_mem_alloc: - case omp_const_mem_alloc: - case omp_high_bw_mem_alloc: - case omp_low_lat_mem_alloc: - free(ptr); - case omp_null_allocator: - default: - return; - } -} - -unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { - rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_HOST_CALL>(); - Port.send_n(data, size); - Port.send([=](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = reinterpret_cast<uintptr_t>(fn); - }); - unsigned long long Ret; - Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Ret = static_cast<unsigned long long>(Buffer->data[0]); - }); - Port.close(); - 
return Ret; -} -} - -///} diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp deleted file mode 100644 index 08ce616aee1c..000000000000 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ /dev/null @@ -1,311 +0,0 @@ -//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { - uint32_t NThreadsICV = - NumThreadsClause != -1 ? 
NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getMaxTeamThreads(); - - if (NThreadsICV != 0 && NThreadsICV < NumThreads) - NumThreads = NThreadsICV; - - // SPMD mode allows any number of threads, for generic mode we round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (mapping::isSPMDMode()) - return NumThreads; - - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); - - return NumThreads; -} - -// Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { - switch (nargs) { -#include "generated_microtask_cases.gen" - default: - printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); - __builtin_trap(); - } -} - -} // namespace - -extern "C" { - -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t PTeamSize = - NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; - // Avoid the race between the read of the `icv::Level` above and the write - // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. 
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, - /*ForceTeamState=*/true); - - // Synchronize all threads after the main thread (TId == 0) set up the - // team state properly. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(PTeamSize, ident, - /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - - if (!PTeamSize || TId < PTeamSize) - invokeMicrotask(TId, 0, fn, args, nargs); - - // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); - } - - // Synchronize all threads to make sure every thread exits the scope above; - // otherwise the following assertions and the assumption in - // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); - - state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true); - icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true); - icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - - return; -} - -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); - - // Assert the parallelism level is zero if disabled by the user. 
- ASSERT((config::mayUseNestedParallelism() || icv::Level == 0), - "nested parallelism while disabled"); - - // Handle the serialized case first, same for SPMD/non-SPMD: - // 1) if-clause(0) - // 2) parallel in task or other thread state inducing construct - // 3) nested parallel regions - if (OMP_UNLIKELY(!if_expr || state::HasThreadState || - (config::mayUseNestedParallelism() && icv::Level))) { - state::DateEnvironmentRAII DERAII(ident); - ++icv::Level; - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - // From this point forward we know that there is no thread state used. - ASSERT(state::HasThreadState == false, nullptr); - - if (mapping::isSPMDMode()) { - // This was moved to its own routine so it could be called directly - // in certain situations to avoid resource consumption of unused - // logic in parallel_51. - __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); - - return; - } - - uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); - uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - - // We do *not* create a new data environment because all threads in the team - // that are active are now running this parallel region. They share the - // TeamState, which has an increase level-var and potentially active-level - // set, but they do not have individual ThreadStates yet. If they ever - // modify the ICVs beyond this point a ThreadStates will be allocated. 
- - bool IsActiveParallelRegion = NumThreads > 1; - if (!IsActiveParallelRegion) { - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); - invokeMicrotask(TId, 0, fn, args, nargs); - return; - } - - void **GlobalArgs = nullptr; - if (nargs) { - __kmpc_begin_sharing_variables(&GlobalArgs, nargs); - switch (nargs) { - default: - for (int I = 0; I < nargs; I++) - GlobalArgs[I] = args[I]; - break; - case 16: - GlobalArgs[15] = args[15]; - [[fallthrough]]; - case 15: - GlobalArgs[14] = args[14]; - [[fallthrough]]; - case 14: - GlobalArgs[13] = args[13]; - [[fallthrough]]; - case 13: - GlobalArgs[12] = args[12]; - [[fallthrough]]; - case 12: - GlobalArgs[11] = args[11]; - [[fallthrough]]; - case 11: - GlobalArgs[10] = args[10]; - [[fallthrough]]; - case 10: - GlobalArgs[9] = args[9]; - [[fallthrough]]; - case 9: - GlobalArgs[8] = args[8]; - [[fallthrough]]; - case 8: - GlobalArgs[7] = args[7]; - [[fallthrough]]; - case 7: - GlobalArgs[6] = args[6]; - [[fallthrough]]; - case 6: - GlobalArgs[5] = args[5]; - [[fallthrough]]; - case 5: - GlobalArgs[4] = args[4]; - [[fallthrough]]; - case 4: - GlobalArgs[3] = args[3]; - [[fallthrough]]; - case 3: - GlobalArgs[2] = args[2]; - [[fallthrough]]; - case 2: - GlobalArgs[1] = args[1]; - [[fallthrough]]; - case 1: - GlobalArgs[0] = args[0]; - [[fallthrough]]; - case 0: - break; - } - } - - { - // Note that the order here is important. `icv::Level` has to be updated - // last or the other updates will cause a thread specific state to be - // created. 
- state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, - 1u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, - (void *)nullptr, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident, - /*ForceTeamState=*/true); - - // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); - // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); - } - - if (nargs) - __kmpc_end_sharing_variables(); -} - -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { - // Work function and arguments for L1 parallel region. - *WorkFn = state::ParallelRegionFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) - return false; - - // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); - return ThreadIsActive; -} - -[[clang::noinline]] void __kmpc_kernel_end_parallel() { - // In case we have modified an ICV for this thread before a ThreadState was - // created. We drop it now to not contaminate the next parallel region. 
- ASSERT(!mapping::isSPMDMode(), nullptr); - uint32_t TId = mapping::getThreadIdInBlock(); - state::resetStateForThread(TId); - ASSERT(!mapping::isSPMDMode(), nullptr); -} - -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } - -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } - -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} - -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} -} diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp deleted file mode 100644 index df141af5ebee..000000000000 --- a/offload/DeviceRTL/src/Profiling.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===------- Profiling.cpp ---------------------------------------- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Profiling.h" - -extern "C" { - -// Provides empty implementations for certain functions in compiler-rt -// that are emitted by the PGO instrumentation. -void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} -} diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp deleted file mode 100644 index fffd0063940c..000000000000 --- a/offload/DeviceRTL/src/Reduction.cpp +++ /dev/null @@ -1,316 +0,0 @@ -//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. -// -//===----------------------------------------------------------------------===// - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -namespace { - -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { - for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -static uint32_t gpu_irregular_simd_reduce(void *reduce_data, - ShuffleReductFnTy shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); - __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); - do { - Liveness = mapping::activemask(); - remote_id = utils::ffs(Liveness & lanemask_gt); - size = utils::popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -static int32_t 
nvptx_parallel_reduce_nowait(void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - uint32_t BlockThreadId = mapping::getThreadIdInBlock(); - if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false)) - BlockThreadId = 0; - uint32_t NumThreads = omp_get_num_threads(); - if (NumThreads == 1) - return 1; - - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // - -#if __has_builtin(__nvvm_reflect) - if (__nvvm_reflect("__CUDA_ARCH") >= 700) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || - (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce( - reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize()); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; - } -#endif - __kmpc_impl_lanemask_t Liveness = mapping::activemask(); - if (Liveness == lanes::All) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/utils::popc(Liveness), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - else { // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - } - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case - // of an L2 parallel region. - return BlockThreadId == 0; -} - -uint32_t roundToWarpsize(uint32_t s) { - if (s < mapping::getWarpSize()) - return 1; - return (s & ~(unsigned)(mapping::getWarpSize() - 1)); -} - -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } - -} // namespace - -extern "C" { -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { - return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); -} - -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, - uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, - ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) { - // Terminate all threads in non-SPMD mode except for the master thread. - uint32_t ThreadId = mapping::getThreadIdInBlock(); - if (mapping::isGenericMode()) { - if (!mapping::isMainThreadInGenericMode()) - return 0; - ThreadId = 0; - } - - uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; - uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = omp_get_num_threads(); - uint32_t TeamId = omp_get_team_num(); - uint32_t NumTeams = omp_get_num_teams(); - [[clang::loader_uninitialized]] static Local<unsigned> Bound; - [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. 
- bool IsMaster = (ThreadId == 0); - while (IsMaster) { - Bound = atomic::load(&IterCnt, atomic::acquire); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) { - lgcpyFct(GlobalBuffer, ModBockId, reduce_data); - } else - lgredFct(GlobalBuffer, ModBockId, reduce_data); - - // Propagate the memory writes above to the world. - fence::kernel(atomic::release); - - // Increment team counter. - // This counter is incremented by all teams in the current - // num_of_records chunk. - ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst, - atomic::MemScopeTy::device); - } - - // Synchronize in SPMD mode as in generic mode all but 1 threads are in the - // state machine. - if (mapping::isSPMDMode()) - synchronize::threadsAligned(atomic::acq_rel); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // Ensure we see the global memory writes by other teams - fence::kernel(atomic::acquire); - - // - // Last team processing. 
- // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(GlobalBuffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(GlobalBuffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); - if (ActiveThreads > mapping::getWarpSize()) { - uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / mapping::getWarpSize(); - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst); - } - - return 0; -} -} - -void *__kmpc_reduction_get_fixed_buffer() { - return state::getKernelLaunchEnvironment().ReductionBuffer; -} diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp deleted file mode 100644 index 475395102f47..000000000000 --- a/offload/DeviceRTL/src/State.cpp +++ /dev/null @@ -1,482 +0,0 @@ -//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "Shared/Environment.h" - -#include "Allocator.h" -#include "Configuration.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "LibC.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -/// Memory implementation -/// -///{ - -/// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[]; - -/// The kernel environment passed to the init method by the compiler. -[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *> - KernelEnvironmentPtr; - -/// The kernel launch environment passed as argument to the kernel by the -/// runtime. -[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> - KernelLaunchEnvironmentPtr; - -///} - -namespace { - -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - -/// A "smart" stack in shared memory. -/// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. 
The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. -/// -struct SharedMemorySmartStackTy { - /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); - - /// Allocate \p Bytes on the stack for the encountering thread. Each thread - /// can call this function. - void *push(uint64_t Bytes); - - /// Deallocate the last allocation made by the encountering thread and pointed - /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); - -private: - /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); - return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock, - allocator::ALIGNMENT); - } - - /// Return the top address of the warp data stack, that is the first address - /// this warp will allocate memory at next. - void *getThreadDataTop(uint32_t TId) { - return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; - } - - /// The actual storage, shared among all warps. - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned( - allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; -}; - -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - -/// The allocation of a single shared memory scratchpad. -[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy> - SharedMemorySmartStack; - -void SharedMemorySmartStackTy::init(bool IsSPMD) { - Usage[mapping::getThreadIdInBlock()] = 0; -} - -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { - // First align the number of requested bytes. 
- /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to - /// be passed in as an argument and the stack rewritten to support it. - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - - uint32_t StorageTotal = computeThreadStorageTotal(); - - // The main thread in generic mode gets the space of its entire warp as the - // other threads do not participate in any computation at all. - if (mapping::isMainThreadInGenericMode()) - StorageTotal *= mapping::getWarpSize(); - - int TId = mapping::getThreadIdInBlock(); - if (Usage[TId] + AlignedBytes <= StorageTotal) { - void *Ptr = getThreadDataTop(TId); - Usage[TId] += AlignedBytes; - return Ptr; - } - - if (config::isDebugMode(DeviceDebugKind::CommonIssues)) - printf("Shared memory stack full, fallback to dynamic allocation of global " - "memory will negatively impact performance.\n"); - void *GlobalMemory = memory::allocGlobal( - AlignedBytes, "Slow path shared memory allocation, insufficient " - "shared memory stack memory!"); - ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!"); - - return GlobalMemory; -} - -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { - uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT); - if (utils::isSharedMemPtr(Ptr)) { - int TId = mapping::getThreadIdInBlock(); - Usage[TId] -= AlignedBytes; - return; - } - memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); -} - -} // namespace - -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } - -void *memory::allocShared(uint64_t Bytes, const char *Reason) { - return SharedMemorySmartStack.push(Bytes); -} - -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { - SharedMemorySmartStack.pop(Ptr, Bytes); -} - -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); - if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) - printf("nullptr returned by malloc!\n"); 
- return Ptr; -} - -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } - -///} - -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { - return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & - (ActiveLevelVar == Other.ActiveLevelVar) & - (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & - (RunSchedVar == Other.RunSchedVar) & - (RunSchedChunkVar == Other.RunSchedChunkVar); -} - -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { - ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); - ASSERT(LevelVar == Other.LevelVar, nullptr); - ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); - ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); - ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); - ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); -} - -void state::TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = 0; - ICVState.LevelVar = 0; - ICVState.ActiveLevelVar = 0; - ICVState.Padding0Val = 0; - ICVState.MaxActiveLevelsVar = 1; - ICVState.RunSchedVar = omp_sched_static; - ICVState.RunSchedChunkVar = 1; - ParallelTeamSize = 1; - HasThreadState = false; - ParallelRegionFnVar = nullptr; -} - -bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { - return (ICVState == Other.ICVState) & - (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); -} - -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { - ICVState.assertEqual(Other.ICVState); - ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); - ASSERT(HasThreadState == Other.HasThreadState, nullptr); -} - -[[clang::loader_uninitialized]] Local<state::TeamStateTy> - ompx::state::TeamState; -[[clang::loader_uninitialized]] Local<state::ThreadStateTy **> - ompx::state::ThreadStates; - -namespace { - -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { - if (Level == 0) - return DefaultVal; - int LevelVar = 
omp_get_level(); - if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) - return OutOfBoundsVal; - int ActiveLevel = icv::ActiveLevel; - if (OMP_UNLIKELY(Level != ActiveLevel)) - return DefaultVal; - return Val; -} - -} // namespace - -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { - SharedMemorySmartStack.init(IsSPMD); - if (mapping::isInitialThreadInLevel0(IsSPMD)) { - TeamState.init(IsSPMD); - ThreadStates = nullptr; - KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; - } -} - -KernelEnvironmentTy &state::getKernelEnvironment() { - return *KernelEnvironmentPtr; -} - -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { - return *KernelLaunchEnvironmentPtr; -} - -void state::enterDataEnvironment(IdentTy *Ident) { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - if (!config::mayUseThreadStates()) - return; - - unsigned TId = mapping::getThreadIdInBlock(); - ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>( - memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc")); - uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); - if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = - sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); - void *ThreadStatesPtr = - memory::allocGlobal(Bytes, "Thread state array allocation"); - __builtin_memset(ThreadStatesPtr, 0, Bytes); - if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), - reinterpret_cast<uintptr_t>(ThreadStatesPtr), - atomic::seq_cst, atomic::seq_cst)) - memory::freeGlobal(ThreadStatesPtr, - "Thread state array allocated multiple times"); - ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst), - "Expected valid thread states bit!"); - } - NewThreadState->init(ThreadStates[TId]); - TeamState.HasThreadState = true; - ThreadStates[TId] = NewThreadState; -} 
- -void state::exitDataEnvironment() { - ASSERT(config::mayUseThreadStates(), - "Thread state modified while explicitly disabled!"); - - unsigned TId = mapping::getThreadIdInBlock(); - resetStateForThread(TId); -} - -void state::resetStateForThread(uint32_t TId) { - if (!config::mayUseThreadStates()) - return; - if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) - return; - - ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; - memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc"); - ThreadStates[TId] = PreviousThreadState; -} - -void state::runAndCheckState(void(Func(void))) { - TeamStateTy OldTeamState = TeamState; - OldTeamState.assertEqual(TeamState); - - Func(); - - OldTeamState.assertEqual(TeamState); -} - -void state::assumeInitialState(bool IsSPMD) { - TeamStateTy InitialTeamState; - InitialTeamState.init(IsSPMD); - InitialTeamState.assertEqual(TeamState); - ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); -} - -int state::getEffectivePTeamSize() { - int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); -} - -extern "C" { -void omp_set_dynamic(int V) {} - -int omp_get_dynamic(void) { return 0; } - -void omp_set_num_threads(int V) { icv::NThreads = V; } - -int omp_get_max_threads(void) { - int NT = icv::NThreads; - return NT > 0 ? 
NT : mapping::getMaxTeamThreads(); -} - -int omp_get_level(void) { - int LevelVar = icv::Level; - ASSERT(LevelVar >= 0, nullptr); - return LevelVar; -} - -int omp_get_active_level(void) { return !!icv::ActiveLevel; } - -int omp_in_parallel(void) { return !!icv::ActiveLevel; } - -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { - *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); - *ChunkSize = state::RunSchedChunk; -} - -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { - icv::RunSched = (int)ScheduleKind; - state::RunSchedChunk = ChunkSize; -} - -int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); -} - -int omp_get_thread_num(void) { - return omp_get_ancestor_thread_num(omp_get_level()); -} - -int omp_get_team_size(int Level) { - return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); -} - -int omp_get_num_threads(void) { - return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); -} - -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } - -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } - -void omp_set_nested(int) {} - -int omp_get_nested(void) { return false; } - -void omp_set_max_active_levels(int Levels) { - icv::MaxActiveLevels = Levels > 0 ? 
1 : 0; -} - -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } - -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } - -int omp_get_num_places(void) { return 0; } - -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } - -void omp_get_place_proc_ids(int, int *) { - // TODO -} - -int omp_get_place_num(void) { return 0; } - -int omp_get_partition_num_places(void) { return 0; } - -void omp_get_partition_place_nums(int *) { - // TODO -} - -int omp_get_cancellation(void) { return 0; } - -void omp_set_default_device(int) {} - -int omp_get_default_device(void) { return -1; } - -int omp_get_num_devices(void) { return config::getNumDevices(); } - -int omp_get_device_num(void) { return config::getDeviceNum(); } - -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } - -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } - -int omp_get_initial_device(void) { return -1; } - -int omp_is_initial_device(void) { return 0; } -} - -extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { - return memory::allocShared(Bytes, "Frontend alloc shared"); -} - -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { - memory::freeShared(Ptr, Bytes, "Frontend free shared"); -} - -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } - -void *llvm_omp_target_dynamic_shared_alloc() { - return __kmpc_get_dynamic_shared(); -} - -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } - -/// Allocate storage in shared memory to communicate arguments from the main -/// thread to the workers in generic mode. If we exceed -/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. 
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; - -[[clang::loader_uninitialized]] static Local<void *> - SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static Local<void **> - SharedMemVariableSharingSpacePtr; - -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { - if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { - SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; - } else { - SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal( - nArgs * sizeof(void *), "new extended args"); - ASSERT(SharedMemVariableSharingSpacePtr != nullptr, - "Nullptr returned by malloc!"); - } - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} - -void __kmpc_end_sharing_variables() { - if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) - memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); -} - -void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = SharedMemVariableSharingSpacePtr; -} -} diff --git a/offload/DeviceRTL/src/Stub.cpp b/offload/DeviceRTL/src/Stub.cpp deleted file mode 100644 index e833423eb265..000000000000 --- a/offload/DeviceRTL/src/Stub.cpp +++ /dev/null @@ -1 +0,0 @@ -// This is an empty file used to create a device fatbinary. diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp deleted file mode 100644 index 2f1ed34a3f6d..000000000000 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ /dev/null @@ -1,379 +0,0 @@ -//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. 
-// -//===----------------------------------------------------------------------===// - -#include "Synchronization.h" - -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" - -using namespace ompx; - -namespace impl { - -/// Atomics -/// -///{ -///} - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - // builtin_amdgcn_atomic_inc32 should expand to this switch when - // passed a runtime value, but does not do so yet. Workaround here. - -#define ScopeSwitch(ORDER) \ - switch (MemScope) { \ - case atomic::MemScopeTy::system: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \ - case atomic::MemScopeTy::device: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \ - case atomic::MemScopeTy::workgroup: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \ - case atomic::MemScopeTy::wavefront: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \ - case atomic::MemScopeTy::single: \ - return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \ - } - -#define Case(ORDER) \ - case ORDER: \ - ScopeSwitch(ORDER) - - switch (Ordering) { - default: - __builtin_unreachable(); - Case(atomic::relaxed); - Case(atomic::acquire); - Case(atomic::release); - Case(atomic::acq_rel); - Case(atomic::seq_cst); -#undef Case -#undef ScopeSwitch - } -} - -[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker; - -void namedBarrierInit() { - // Don't have global ctors, and shared memory is not zero init - atomic::store(&namedBarrierTracker, 0u, atomic::release); -} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - // assert(NumThreads % 32 == 0); - - uint32_t WarpSize = mapping::getWarpSize(); - uint32_t NumWaves = NumThreads / WarpSize; - - fence::team(atomic::acquire); - - // named barrier 
implementation for amdgcn. - // Uses two 16 bit unsigned counters. One for the number of waves to have - // reached the barrier, and one to count how many times the barrier has been - // passed. These are packed in a single atomically accessed 32 bit integer. - // Low bits for the number of waves, assumed zero before this call. - // High bits to count the number of times the barrier has been passed. - - // precondition: NumWaves != 0; - // invariant: NumWaves * WarpSize == NumThreads; - // precondition: NumWaves < 0xffffu; - - // Increment the low 16 bits once, using the lowest active thread. - if (mapping::isLeaderInWarp()) { - uint32_t load = atomic::add(&namedBarrierTracker, 1, - atomic::relaxed); // commutative - - // Record the number of times the barrier has been passed - uint32_t generation = load & 0xffff0000u; - - if ((load & 0x0000ffffu) == (NumWaves - 1)) { - // Reached NumWaves in low bits so this is the last wave. - // Set low bits to zero and increment high bits - load += 0x00010000u; // wrap is safe - load &= 0xffff0000u; // because bits zeroed second - - // Reset the wave counter and release the waiting waves - atomic::store(&namedBarrierTracker, load, atomic::relaxed); - } else { - // more waves still to go, spin until generation counter changes - do { - __builtin_amdgcn_s_sleep(0); - load = atomic::load(&namedBarrierTracker, atomic::relaxed); - } while ((load & 0xffff0000u) == generation); - } - } - fence::team(atomic::release); -} - -void fenceTeam(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); -} - -void fenceKernel(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::device); -} - -void fenceSystem(atomic::OrderingTy Ordering) { - return __scoped_atomic_thread_fence(Ordering, atomic::system); -} - -void syncWarp(__kmpc_impl_lanemask_t) { - // This is a no-op on current AMDGPU hardware but it is used by the optimizer - // to enforce convergent behaviour between 
control flow graphs. - __builtin_amdgcn_wave_barrier(); -} - -void syncThreads(atomic::OrderingTy Ordering) { - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); - - __builtin_amdgcn_s_barrier(); - - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst); -} -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } - -// TODO: Don't have wavefront lane locks. Possibly can't have them. -void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } - -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -void unsetCriticalLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); -} - -void setCriticalLock(omp_lock_t *Lock) { - uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; - if (mapping::getThreadIdInWarp() == LowestActiveThread) { - fenceKernel(atomic::release); - while ( - !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) { - __builtin_amdgcn_s_sleep(32); - } - fenceKernel(atomic::acquire); - } -} - -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return __nvvm_atom_inc_gen_ui(Address, Val); -} - -void namedBarrierInit() {} - -void namedBarrier() { - uint32_t NumThreads = omp_get_num_threads(); - ASSERT(NumThreads % 32 == 0, nullptr); - - // The named barrier for active parallel threads of a team in an L1 parallel - // region to synchronize with each other. 
- constexpr int BarrierNo = 7; - __nvvm_barrier_sync_cnt(BarrierNo, NumThreads); -} - -void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } - -void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } - -void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } - -void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } - -void syncThreads(atomic::OrderingTy Ordering) { - constexpr int BarrierNo = 8; - __nvvm_barrier_sync(BarrierNo); -} - -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } - -constexpr uint32_t OMP_SPIN = 1000; -constexpr uint32_t UNSET = 0; -constexpr uint32_t SET = 1; - -// TODO: This seems to hide a bug in the declare variant handling. If it is -// called before it is defined -// here the overload won't happen. Investigate lalter! -void unsetLock(omp_lock_t *Lock) { - (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst); -} - -int testLock(omp_lock_t *Lock) { - return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst); -} - -void initLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setLock(omp_lock_t *Lock) { - // TODO: not sure spinning is a good idea here.. - while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, - atomic::seq_cst) != UNSET) { - int32_t start = __nvvm_read_ptx_sreg_clock(); - int32_t now; - for (;;) { - now = __nvvm_read_ptx_sreg_clock(); - int32_t cycles = now > start ? 
now - start : now + (0xffffffff - start); - if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } - -#endif -///} - -} // namespace impl - -void synchronize::init(bool IsSPMD) { - if (!IsSPMD) - impl::namedBarrierInit(); -} - -void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } - -void synchronize::threads(atomic::OrderingTy Ordering) { - impl::syncThreads(Ordering); -} - -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { - impl::syncThreadsAligned(Ordering); -} - -void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } - -void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); } - -void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } - -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { - return impl::atomicInc(Addr, V, Ordering, MemScope); -} - -void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } - -extern "C" { -void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} - -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { - __kmpc_barrier(Loc, TId); - return 0; -} - -void __kmpc_barrier(IdentTy *Loc, int32_t TId) { - if (mapping::isSPMDMode()) - return __kmpc_barrier_simple_spmd(Loc, TId); - - // Generic parallel regions are run with multiple of the warp size or single - // threaded, in the latter case we need to stop here. 
- if (omp_get_num_threads() == 1) - return __kmpc_flush(Loc); - - impl::namedBarrier(); -} - -[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { - synchronize::threadsAligned(atomic::OrderingTy::seq_cst); -} - -[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { - synchronize::threads(atomic::OrderingTy::seq_cst); -} - -int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { - return omp_get_thread_num() == 0; -} - -void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { - return omp_get_thread_num() == Filter; -} - -void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} - -int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { - return __kmpc_master(Loc, TId); -} - -void __kmpc_end_single(IdentTy *Loc, int32_t TId) { - // The barrier is explicitly called. -} - -void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } - -uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); } - -void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } - -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { - impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name)); -} - -void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } - -void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } - -void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } - -void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } - -int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } - -void ompx_sync_block(int Ordering) { - impl::syncThreadsAligned(atomic::OrderingTy(Ordering)); -} -void ompx_sync_block_acq_rel() { - impl::syncThreadsAligned(atomic::OrderingTy::acq_rel); -} -void ompx_sync_block_divergent(int Ordering) { - 
impl::syncThreads(atomic::OrderingTy(Ordering)); -} -} // extern "C" diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp deleted file mode 100644 index d0be0ace50df..000000000000 --- a/offload/DeviceRTL/src/Tasking.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// TODO: We should not allocate and execute the task in two steps. A new API is -// needed for that though. -// -//===----------------------------------------------------------------------===// - -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "State.h" - -using namespace ompx; - -extern "C" { - -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn) { - auto TaskSizeInclPrivateValuesPadded = - utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *)); - auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; - TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( - TaskSizeTotal, "explicit task descriptor"); - TaskDescriptor->Payload = - utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded); - TaskDescriptor->TaskFn = TaskFn; - - return TaskDescriptor; -} - -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *) { - state::DateEnvironmentRAII 
DERAII(Loc); - - TaskDescriptor->TaskFn(0, TaskDescriptor); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); - return 0; -} - -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::enterDataEnvironment(Loc); -} - -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { - state::exitDataEnvironment(); - - memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); -} - -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *) {} - -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} - -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} - -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } - -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } - -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *) { - // Skip task entirely if empty iteration space. - if (*LowerBound > *UpperBound) - return; - - // The compiler has already stored lb and ub in the TaskDescriptorTy structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); -} - -int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. 
- return 1; -} - -int omp_get_max_task_priority(void) { return 0; } -} diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp deleted file mode 100644 index a8759307b42b..000000000000 --- a/offload/DeviceRTL/src/Workshare.cpp +++ /dev/null @@ -1,935 +0,0 @@ -//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. -// -//===----------------------------------------------------------------------===// - -#include "Workshare.h" -#include "Debug.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#include "Interface.h" -#include "Mapping.h" -#include "State.h" -#include "Synchronization.h" - -using namespace ompx; - -// TODO: -struct DynamicScheduleTracker { - int64_t Chunk; - int64_t LoopUpperBound; - int64_t NextLowerBound; - int64_t Stride; - kmp_sched_t ScheduleType; - DynamicScheduleTracker *NextDST; -}; - -#define ASSERT0(...) - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -// TODO: This variable is a hack inherited from the old runtime. -[[clang::loader_uninitialized]] static Local<uint64_t> Cnt; - -template <typename T, typename ST> struct omptarget_nvptx_LoopSupport { - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! 
\brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, - T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the beginning of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, - T entityId, T numberOfEntities) { - // No chunk size specified. 
Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - int32_t gtid = omp_get_thread_num(); - int numberOfActiveOMPThreads = omp_get_num_threads(); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - [[fallthrough]]; - } 
// note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - [[fallthrough]]; - } // note: if chunk <=0, use nochunk - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), - omp_get_num_teams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * omp_get_team_num() + gtid, - omp_get_num_teams() * numberOfActiveOMPThreads); - break; - } - default: { - // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - static void dispatch_init(IdentTy *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, - DynamicScheduleTracker *DST) { - int tid = mapping::getThreadIdInBlock(); - T tnum = omp_get_num_threads(); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. 
We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched; - int ChunkInt; - omp_get_schedule(&rtSched, &ChunkInt); - chunk = ChunkInt; - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - } else { - // ASSERT(LT_FUSSY, - // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - // "unknown schedule %d & chunk %lld\n", (int)schedule, - // (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - 
// round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - DST->ScheduleType = schedule; - // save ub - DST->LoopUpperBound = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - DST->Chunk = chunk; - DST->NextLowerBound = lb; - DST->Stride = stride; - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - DST->ScheduleType = schedule; - if (chunk < 1) - chunk = 1; - DST->Chunk = chunk; - DST->LoopUpperBound = ub; - DST->NextLowerBound = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - Cnt = 0; - fence::team(atomic::seq_cst); - } - __kmpc_barrier(loc, threadId); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = mapping::activemask(); - uint32_t leader = utils::ffs(active) - 1; - uint32_t change = utils::popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT(); - unsigned int rank = utils::popc(active & lane_mask_lt); - uint64_t warp_res = 0; - if (rank == 0) { - warp_res = atomic::add(&Cnt, change, atomic::seq_cst); - } - warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize()); - return warp_res + rank; - } - - static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, - T loopUpperBound) { - T N = NextIter(); 
- lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - return FINISHED; - } - - static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride, - DynamicScheduleTracker *DST) { - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = DST->ScheduleType; - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = DST->NextLowerBound; - T ub = DST->LoopUpperBound; - // finished? 
- if (myLb > ub) { - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = DST->Chunk; - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = DST->Stride; - DST->NextLowerBound = myLb + stride; - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, - DST->LoopUpperBound); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - return DISPATCH_NOTFINISHED; - } - - static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// TODO: Expand the dispatch API to take a DST pointer which can then be -// allocated properly without malloc. -// For now, each team will contain an LDS pointer (ThreadDST) to a global array -// of references to the DST structs allocated (in global memory) for each thread -// in the team. 
The global memory array is allocated during the init phase if it -// was not allocated already and will be deallocated when the dispatch phase -// ends: -// -// __kmpc_dispatch_init -// -// ** Dispatch loop ** -// -// __kmpc_dispatch_deinit -// -[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **> - ThreadDST; - -// Create a new DST, link the current one, and define the new as current. -static DynamicScheduleTracker *pushDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - // Each block will allocate an array of pointers to DST structs. The array is - // equal in length to the number of threads in that block. - if (!ThreadDST) { - // Allocate global memory array of pointers to DST structs: - if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0) - ThreadDST = static_cast<DynamicScheduleTracker **>( - memory::allocGlobal(mapping::getNumberOfThreadsInBlock() * - sizeof(DynamicScheduleTracker *), - "new ThreadDST array")); - synchronize::threads(atomic::seq_cst); - - // Initialize the array pointers: - ThreadDST[ThreadIndex] = nullptr; - } - - // Create a DST struct for the current thread: - DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>( - memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST")); - *NewDST = DynamicScheduleTracker({0}); - - // Add the new DST struct to the array of DST structs: - NewDST->NextDST = ThreadDST[ThreadIndex]; - ThreadDST[ThreadIndex] = NewDST; - return NewDST; -} - -// Return the current DST. -static DynamicScheduleTracker *peekDST() { - return ThreadDST[mapping::getThreadIdInBlock()]; -} - -// Pop the current DST and restore the last one. -static void popDST() { - int32_t ThreadIndex = mapping::getThreadIdInBlock(); - DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex]; - DynamicScheduleTracker *OldDST = CurrentDST->NextDST; - memory::freeGlobal(CurrentDST, "remove DST"); - ThreadDST[ThreadIndex] = OldDST; - - // Check if we need to deallocate the global array. 
Ensure all threads - // in the block have finished deallocating the individual DSTs. - synchronize::threads(atomic::seq_cst); - if (!ThreadDST[ThreadIndex] && !ThreadIndex) { - memory::freeGlobal(ThreadDST, "remove ThreadDST array"); - ThreadDST = nullptr; - } - synchronize::threads(atomic::seq_cst); -} - -void workshare::init(bool IsSPMD) { - if (mapping::isInitialThreadInLevel0(IsSPMD)) - ThreadDST = nullptr; -} - -extern "C" { - -// init -void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule, - int32_t lb, int32_t ub, int32_t st, int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule, - uint32_t lb, uint32_t ub, int32_t st, - int32_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule, - int64_t lb, int64_t ub, int64_t st, int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule, - uint64_t lb, uint64_t ub, int64_t st, - int64_t chunk) { - DynamicScheduleTracker *DST = pushDST(); - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); -} - -// next -int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_4u(IdentTy *loc, 
int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { - DynamicScheduleTracker *DST = peekDST(); - return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st, DST); -} - -// fini -void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini(); -} - -void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini(); -} - -// deinit -void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} 
- -void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t 
incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - mapping::isSPMDMode()); -} - -void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} - -void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} -} - -namespace ompx { - -/// Helper class to hide the generic loop nest and provide the template argument -/// throughout. -template <typename Ty> class StaticLoopChunker { - - /// Generic loop nest that handles block and/or thread distribution in the - /// absence of user specified chunk sizes. This implicitly picks a block chunk - /// size equal to the number of threads in the block and a thread chunk size - /// equal to one. In contrast to the chunked version we can get away with a - /// single loop in this case - static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, - Ty NumBlocks, Ty BId, Ty NumThreads, - Ty TId, Ty NumIters, - bool OneIterationPerThread) { - Ty KernelIteration = NumBlocks * NumThreads; - - // Start index in the normalized space. - Ty IV = BId * NumThreads + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - if (IV < NumIters) { - do { - - // Execute the loop body. - LoopBody(IV, Arg); - - // Every thread executed one block and thread chunk now. 
- IV += KernelIteration; - - if (OneIterationPerThread) - return; - - } while (IV < NumIters); - } - } - - /// Generic loop nest that handles block and/or thread distribution in the - /// presence of user specified chunk sizes (for at least one of them). - static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg, - Ty BlockChunk, Ty NumBlocks, Ty BId, - Ty ThreadChunk, Ty NumThreads, Ty TId, - Ty NumIters, - bool OneIterationPerThread) { - Ty KernelIteration = NumBlocks * BlockChunk; - - // Start index in the chunked space. - Ty IV = BId * BlockChunk + TId; - ASSERT(IV >= 0, "Bad index"); - - // Cover the entire iteration space, assumptions in the caller might allow - // to simplify this loop to a conditional. - do { - - Ty BlockChunkLeft = - BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0; - Ty ThreadChunkLeft = - ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft; - - while (ThreadChunkLeft--) { - - // Given the blocking it's hard to keep track of what to execute. - if (IV >= NumIters) - return; - - // Execute the loop body. - LoopBody(IV, Arg); - - if (OneIterationPerThread) - return; - - ++IV; - } - - IV += KernelIteration; - - } while (IV < NumIters); - } - -public: - /// Worksharing `for`-loop. - static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk) { - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but we don't know if we are in a - // parallel at all or if the user might have used a `num_threads` clause - // on the parallel and reduced the number compared to the block size. - // Since nested parallels are possible too we need to get the thread id - // from the `omp` getter and not the mapping directly. - Ty TId = omp_get_thread_num(); - - // There are no blocks involved here. 
- Ty BlockChunk = 0; - Ty NumBlocks = 1; - Ty BId = 0; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads than iterations we can indicate that to - // avoid an outer loop. - bool OneIterationPerThread = false; - if (config::getAssumeThreadsOversubscription()) { - ASSERT(NumThreads >= NumIters, "Broken assumption"); - OneIterationPerThread = true; - } - - if (ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - } - - /// Worksharing `distribute`-loop. - static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk) { - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - - // There are no threads involved here. - Ty ThreadChunk = 0; - Ty NumThreads = 1; - Ty TId = 0; - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If we know we have more blocks than iterations we can indicate that to - // avoid an outer loop. 
- bool OneIterationPerThread = false; - if (config::getAssumeTeamsOversubscription()) { - ASSERT(NumBlocks >= NumIters, "Broken assumption"); - OneIterationPerThread = true; - } - - if (BlockChunk != NumThreads) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 0, "Bad distribute"); - ASSERT(icv::ActiveLevel == 0, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - ASSERT(state::ParallelTeamSize == 1, "Bad distribute"); - } - - /// Worksharing `distribute parallel for`-loop. - static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), - void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk) { - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - - ASSERT(NumIters >= 0, "Bad iteration count"); - ASSERT(BlockChunk >= 0, "Bad block count"); - ASSERT(ThreadChunk >= 0, "Bad thread count"); - - // All threads need to participate but the user might have used a - // `num_threads` clause on the parallel and reduced the number compared to - // the block size. - Ty TId = mapping::getThreadIdInBlock(); - - // All teams need to participate. - Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); - Ty BId = mapping::getBlockIdInKernel(); - - // If the block chunk is not specified we pick a default now. - if (BlockChunk == 0) - BlockChunk = NumThreads; - - // If the thread chunk is not specified we pick a default now. - if (ThreadChunk == 0) - ThreadChunk = 1; - - // If we know we have more threads (across all blocks) than iterations we - // can indicate that to avoid an outer loop. 
- bool OneIterationPerThread = false; - if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); - } - - if (BlockChunk != NumThreads || ThreadChunk != 1) - NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, - ThreadChunk, NumThreads, TId, NumIters, - OneIterationPerThread); - else - NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId, - NumIters, OneIterationPerThread); - - ASSERT(icv::Level == 1, "Bad distribute"); - ASSERT(icv::ActiveLevel == 1, "Bad distribute"); - ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); - } -}; - -} // namespace ompx - -#define OMP_LOOP_ENTRY(BW, TY) \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY block_chunk, TY thread_chunk) { \ - ompx::StaticLoopChunker<TY>::DistributeFor( \ - loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \ - } \ - [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ - void *arg, TY num_iters, \ - TY block_chunk) { \ - ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \ - block_chunk); \ - } \ - [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY thread_chunk) { \ - ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \ - thread_chunk); \ - } - -extern "C" { -OMP_LOOP_ENTRY(_4, int32_t) -OMP_LOOP_ENTRY(_4u, uint32_t) -OMP_LOOP_ENTRY(_8, int64_t) -OMP_LOOP_ENTRY(_8u, uint64_t) -} diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake index 8e955ff39927..ef8cf34ba0c8 100644 --- a/offload/cmake/OpenMPTesting.cmake +++ 
b/offload/cmake/OpenMPTesting.cmake @@ -57,7 +57,7 @@ if (${OPENMP_STANDALONE_BUILD}) if (MSVC OR XCODE) set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar") endif() - if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX") set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800") endif() set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.") diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index 0236f5f0b698..5a27a81c736b 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -15,7 +15,10 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload;flang-rt" CACHE STRING "") set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 16" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "") + +set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "") diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake index a772043c7966..798f080a41ad 100644 --- a/offload/cmake/caches/AMDGPULibcBot.cmake +++ b/offload/cmake/caches/AMDGPULibcBot.cmake @@ -17,5 +17,6 @@ set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "") set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "") -set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "") 
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;libc;libcxxabi;libcxx" CACHE STRING "") set(RUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_JOBS 4 CACHE STRING "") diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake index 5533a6508f5d..3747a1d3eb29 100644 --- a/offload/cmake/caches/Offload.cmake +++ b/offload/cmake/caches/Offload.cmake @@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "") set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "") set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "") -set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "") -set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "") +set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "") diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c1658293..45bd9c6e7da8 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -49,9 +49,46 @@ public: /// Information about shadow pointers. struct ShadowPtrInfoTy { void **HstPtrAddr = nullptr; - void *HstPtrVal = nullptr; void **TgtPtrAddr = nullptr; - void *TgtPtrVal = nullptr; + int64_t PtrSize = sizeof(void *); // Size of the pointer/descriptor + + // Store the complete contents for both host and target pointers/descriptors. + // 96 bytes is chosen as the "Small" size to cover simple Fortran + // descriptors of up to 3 dimensions. 
+ llvm::SmallVector<char, 96> HstPtrContent; + llvm::SmallVector<char, 96> TgtPtrContent; + + ShadowPtrInfoTy(void **HstPtrAddr, void **TgtPtrAddr, void *TgtPteeBase, + int64_t PtrSize) + : HstPtrAddr(HstPtrAddr), TgtPtrAddr(TgtPtrAddr), PtrSize(PtrSize), + HstPtrContent(PtrSize), TgtPtrContent(PtrSize) { + constexpr int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrAddr != nullptr && "HstPtrAddr is nullptr"); + assert(TgtPtrAddr != nullptr && "TgtPtrAddr is nullptr"); + assert(PtrSize >= VoidPtrSize && "PtrSize is less than sizeof(void *)"); + + void *HstPteeBase = *HstPtrAddr; + // The first VoidPtrSize bytes for HstPtrContent/TgtPtrContent are from + // HstPteeBase/TgtPteeBase. + std::memcpy(HstPtrContent.data(), &HstPteeBase, VoidPtrSize); + std::memcpy(TgtPtrContent.data(), &TgtPteeBase, VoidPtrSize); + + // If we are not dealing with Fortran descriptors (pointers larger than + // VoidPtrSize), then that's that. + if (PtrSize <= VoidPtrSize) + return; + + // For larger pointers, i.e. Fortran descriptors, the remaining contents of + // the descriptor come from the host descriptor, i.e. HstPtrAddr. + std::memcpy(HstPtrContent.data() + VoidPtrSize, + reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize, + PtrSize - VoidPtrSize); + std::memcpy(TgtPtrContent.data() + VoidPtrSize, + reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize, + PtrSize - VoidPtrSize); + } + + ShadowPtrInfoTy() = delete; bool operator==(const ShadowPtrInfoTy &Other) const { return HstPtrAddr == Other.HstPtrAddr; @@ -243,9 +280,25 @@ public: auto Pair = States->ShadowPtrInfos.insert(ShadowPtrInfo); if (Pair.second) return true; + // Check for a stale entry, if found, replace the old one. - if ((*Pair.first).TgtPtrVal == ShadowPtrInfo.TgtPtrVal) + + // For Fortran descriptors, we need to compare their full contents, + // as the starting address may be the same while other fields have + // been updated. e.g. + // + // !$omp target enter data map(x(1:100)) ! 
(1) + // p => x(10: 19) + // !$omp target enter data map(p, p(:)) ! (2) + // p => x(5: 9) + // !$omp target enter data map(attach(always): p(:)) ! (3) + // + // While &desc_p and &p(1) (TgtPtrAddr and first "sizeof(void*)" bytes of + // TgtPtrContent) are same for (2) and (3), the pointer attachment for (3) + // needs to update the bounds information in the descriptor of p on device. + if ((*Pair.first).TgtPtrContent == ShadowPtrInfo.TgtPtrContent) return false; + States->ShadowPtrInfos.erase(ShadowPtrInfo); return addShadowPointer(ShadowPtrInfo); } @@ -417,12 +470,42 @@ struct MapperComponentsTy { typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t, void *); +/// Structure to store information about a single ATTACH map entry. +struct AttachMapInfo { + void *PointerBase; + void *PointeeBegin; + int64_t PointerSize; + int64_t MapType; + map_var_info_t Pointername; + + AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size, + int64_t Type, map_var_info_t Name) + : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size), + MapType(Type), Pointername(Name) {} +}; + +/// Structure to track ATTACH entries and new allocations across recursive calls +/// (for handling mappers) to targetDataBegin for a given construct. +struct AttachInfoTy { + /// ATTACH map entries for deferred processing. + llvm::SmallVector<AttachMapInfo> AttachEntries; + + /// Key: host pointer, Value: allocation size. + llvm::DenseMap<void *, int64_t> NewAllocations; + + AttachInfoTy() = default; + + // Delete copy constructor and copy assignment operator to prevent copying + AttachInfoTy(const AttachInfoTy &) = delete; + AttachInfoTy &operator=(const AttachInfoTy &) = delete; +}; + // Function pointer type for targetData* functions (targetDataBegin, // targetDataEnd and targetDataUpdate). 
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, AsyncInfoTy &, - bool); + AttachInfoTy *, bool); void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device, bool toStdOut = false); @@ -431,20 +514,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper = false); + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); +// Process deferred ATTACH map entries collected during targetDataBegin. +int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo); + struct MappingInfoTy { MappingInfoTy(DeviceTy &Device) : Device(Device) {} diff --git a/offload/include/device.h b/offload/include/device.h index f4b10abbaa3f..bf93ce0460ae 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -33,7 +33,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "GlobalHandler.h" #include "PluginInterface.h" + using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; // Forward declarations. 
@@ -98,6 +100,10 @@ struct DeviceTy { int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo); + // Insert a data fence between previous data operations and the following + // operations if necessary for the device. + int32_t dataFence(AsyncInfoTy &AsyncInfo); + /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. int32_t notifyDataMapped(void *HstPtr, int64_t Size); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 625bbaa0db85..794b79e07674 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -77,6 +77,9 @@ enum tgt_map_type { // the structured region // This is an OpenMP extension for the sake of OpenACC support. OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, + // Attach pointer and pointee, after processing all other maps. + // Applicable to map-entering directives. Does not change ref-count. + OMP_TGT_MAPTYPE_ATTACH = 0x4000, // descriptor for non-contiguous target-update OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 @@ -98,8 +101,6 @@ enum TargetAllocTy : int32_t { TARGET_ALLOC_HOST, TARGET_ALLOC_SHARED, TARGET_ALLOC_DEFAULT, - /// The allocation will not block on other streams. - TARGET_ALLOC_DEVICE_NON_BLOCKING, }; inline KernelArgsTy CTorDTorKernelArgs = { diff --git a/offload/liboffload/API/APIDefs.td b/offload/liboffload/API/APIDefs.td index 640932dcf846..ea3896fc3103 100644 --- a/offload/liboffload/API/APIDefs.td +++ b/offload/liboffload/API/APIDefs.td @@ -31,6 +31,13 @@ class IsHandleType<string Type> { !ne(!find(Type, "_handle_t", !sub(!size(Type), 9)), -1)); } +// Does the type end with '_cb_t'? +class IsCallbackType<string Type> { + // size("_cb_t") == 5 + bit ret = !if(!lt(!size(Type), 5), 0, + !ne(!find(Type, "_cb_t", !sub(!size(Type), 5)), -1)); +} + // Does the type end with '*'? 
class IsPointerType<string Type> { bit ret = !ne(!find(Type, "*", !sub(!size(Type), 1)), -1); @@ -58,6 +65,7 @@ class Param<string Type, string Name, string Desc, bits<3> Flags = 0> { TypeInfo type_info = TypeInfo<"", "">; bit IsHandle = IsHandleType<type>.ret; bit IsPointer = IsPointerType<type>.ret; + bit IsCallback = IsCallbackType<type>.ret; } // A parameter whose range is described by other parameters in the function. @@ -81,7 +89,7 @@ class ShouldCheckHandle<Param P> { } class ShouldCheckPointer<Param P> { - bit ret = !and(P.IsPointer, !eq(!and(PARAM_OPTIONAL, P.flags), 0)); + bit ret = !and(!or(P.IsPointer, P.IsCallback), !eq(!and(PARAM_OPTIONAL, P.flags), 0)); } // For a list of returns that contains a specific return code, find and append @@ -137,7 +145,6 @@ defvar DefaultReturns = [Return<PREFIX#"_RESULT_SUCCESS">, Return<PREFIX#"_ERRC_DEVICE_LOST">]; class APIObject { - string name; string desc; } @@ -168,6 +175,10 @@ class Enum : APIObject { // all Etor values must be TaggedEtor records bit is_typed = 0; + // This refers to whether the enumerator is used to name bits of a bit field, + // where consecutive values are bit-shifted rather than incremented. 
+ bit is_bit_field = 0; + list<Etor> etors = []; } diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td index 6eaf604c8ebb..ac27d85b6c96 100644 --- a/offload/liboffload/API/Common.td +++ b/offload/liboffload/API/Common.td @@ -10,77 +10,64 @@ // //===----------------------------------------------------------------------===// -def : Macro { - let name = "OL_VERSION_MAJOR"; +def OL_VERSION_MAJOR : Macro { let desc = "Major version of the Offload API"; let value = "0"; } -def : Macro { - let name = "OL_VERSION_MINOR"; +def OL_VERSION_MINOR : Macro { let desc = "Minor version of the Offload API"; let value = "0"; } -def : Macro { - let name = "OL_VERSION_PATCH"; +def OL_VERSION_PATCH : Macro { let desc = "Patch version of the Offload API"; let value = "1"; } -def : Macro { - let name = "OL_APICALL"; +def OL_APICALL : Macro { let desc = "Calling convention for all API functions"; let condition = "defined(_WIN32)"; let value = "__cdecl"; let alt_value = ""; } -def : Macro { - let name = "OL_APIEXPORT"; +def OL_APIEXPORT : Macro { let desc = "Microsoft-specific dllexport storage-class attribute"; let condition = "defined(_WIN32)"; let value = "__declspec(dllexport)"; let alt_value = ""; } -def : Handle { - let name = "ol_platform_handle_t"; +def ol_platform_handle_t : Handle { let desc = "Handle of a platform instance"; } -def : Handle { - let name = "ol_device_handle_t"; +def ol_device_handle_t : Handle { let desc = "Handle of platform's device object"; } -def : Handle { - let name = "ol_context_handle_t"; +def ol_context_handle_t : Handle { let desc = "Handle of context object"; } -def : Handle { - let name = "ol_queue_handle_t"; +def ol_queue_handle_t : Handle { let desc = "Handle of queue object"; } -def : Handle { - let name = "ol_event_handle_t"; +def ol_event_handle_t : Handle { let desc = "Handle of event object"; } -def : Handle { - let name = "ol_program_handle_t"; +def ol_program_handle_t : Handle { let desc = "Handle of program 
object"; } -def : Handle { - let name = "ol_symbol_handle_t"; +def ol_symbol_handle_t : Handle { let desc = "Handle of an object in a device's memory for a specific program"; } -def ErrorCode : Enum { - let name = "ol_errc_t"; +def ol_errc_t : Enum { let desc = "Defines Return/Error codes"; let etors =[ Etor<"SUCCESS", "success">, @@ -115,8 +102,7 @@ def ErrorCode : Enum { ]; } -def : Struct { - let name = "ol_error_struct_t"; +def ol_error_struct_t : Struct { let desc = "Details of the error condition returned by an API call"; let members = [ StructMember<"ol_errc_t", "Code", "The error code">, @@ -124,20 +110,17 @@ def : Struct { ]; } -def : Typedef { - let name = "ol_result_t"; +def ol_result_t : Typedef { let desc = "Result type returned by all entry points."; - let value = "const ol_error_struct_t*"; + let value = "const struct ol_error_struct_t*"; } -def : Macro { - let name = "OL_SUCCESS"; +def OL_SUCCESS : Macro { let desc = "Success condition"; let value = "NULL"; } -def : Struct { - let name = "ol_code_location_t"; +def ol_code_location_t : Struct { let desc = "Code location information that can optionally be associated with an API call"; let members = [ StructMember<"const char*", "FunctionName", "Function name">, @@ -147,8 +130,7 @@ def : Struct { ]; } -def : Struct { - let name = "ol_dimensions_t"; +def ol_dimensions_t : Struct { let desc = "A three element vector"; let members = [ StructMember<"uint32_t", "x", "X">, @@ -157,8 +139,7 @@ def : Struct { ]; } -def : Function { - let name = "olInit"; +def olInit : Function { let desc = "Perform initialization of the Offload library and plugins"; let details = [ "This must be the first API call made by a user of the Offload library", @@ -168,8 +149,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olShutDown"; +def olShutDown : Function { let desc = "Release the resources in use by Offload"; let details = [ "This decrements an internal reference count. 
When this reaches 0, all resources will be released", diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 857c596124b2..5b54c79d83f9 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_device_type_t"; +def ol_device_type_t : Enum { let desc = "Supported device types."; let etors =[ Etor<"DEFAULT", "The default device type as preferred by the runtime">, @@ -22,23 +21,54 @@ def : Enum { ]; } -def DeviceInfo : Enum { - let name = "ol_device_info_t"; +def ol_device_info_t : Enum { let desc = "Supported device info."; let is_typed = 1; - let etors =[ + list<TaggedEtor> basic_etors =[ TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">, TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, + TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">, + TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">, + TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">, + TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">, + TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">, + TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">, + TaggedEtor<"MEMORY_CLOCK_RATE", "uint32_t", "Memory clock frequency in MHz">, + TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number 
of bits used to represent an address in device memory">, + TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, + TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, ]; + list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">); + list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); + let etors = !listconcat(basic_etors, fp_configs, native_vec_widths); +} + +def ol_device_fp_capability_flag_t : Enum { + let desc = "Device floating-point capability flags"; + let is_bit_field = 1; + let etors =[ + Etor<"CORRECTLY_ROUNDED_DIVIDE_SQRT", "Support correctly rounded divide and sqrt">, + Etor<"ROUND_TO_NEAREST", "Support round to nearest">, + Etor<"ROUND_TO_ZERO", "Support round to zero">, + Etor<"ROUND_TO_INF", "Support round to infinity">, + Etor<"INF_NAN", "Support INF to NAN">, + Etor<"DENORM", "Support denorm">, + Etor<"FMA", "Support fused multiply-add">, + Etor<"SOFT_FLOAT", "Basic floating point operations implemented in software">, + ]; +} + +def ol_device_fp_capability_flags_t : Typedef { + let desc = "Device floating-point capability flags"; + let value = "uint32_t"; } -def : FptrTypedef { - let name = "ol_device_iterate_cb_t"; +def ol_device_iterate_cb_t : FptrTypedef { let desc = "User-provided function to be used with `olIterateDevices`"; let params = [ Param<"ol_device_handle_t", "Device", "the device handle of the current iteration", PARAM_IN>, @@ -47,8 +77,7 @@ def : FptrTypedef { let return = "bool"; } -def : Function { - let name = "olIterateDevices"; +def olIterateDevices : Function { let desc = "Iterates over all available devices, calling the callback for each device."; let details = [ "If the 
user-provided callback returns `false`, the iteration is stopped." @@ -62,8 +91,7 @@ def : Function { ]; } -def : Function { - let name = "olGetDeviceInfo"; +def olGetDeviceInfo : Function { let desc = "Queries the given property of the device."; let details = []; let params = [ @@ -86,8 +114,7 @@ def : Function { ]; } -def : Function { - let name = "olGetDeviceInfoSize"; +def olGetDeviceInfoSize : Function { let desc = "Returns the storage size of the given device query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td index 9d217ae23038..075bf5bafaa6 100644 --- a/offload/liboffload/API/Event.td +++ b/offload/liboffload/API/Event.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateEvent"; +def olCreateEvent : Function { let desc = "Enqueue an event to `Queue` and return it."; let details = [ "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.", @@ -23,8 +22,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olDestroyEvent"; +def olDestroyEvent : Function { let desc = "Destroy the event and free all underlying resources."; let details = []; let params = [ @@ -33,8 +31,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olSyncEvent"; +def olSyncEvent : Function { let desc = "Block the calling thread until the event is complete."; let details = []; let params = [ @@ -43,17 +40,16 @@ def : Function { let returns = []; } -def : Enum { - let name = "ol_event_info_t"; +def ol_event_info_t : Enum { let desc = "Supported event info."; let is_typed = 1; let etors = [ - TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device."> + TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device.">, + 
TaggedEtor<"IS_COMPLETE", "bool", "True if and only if the event is complete.">, ]; } -def : Function { - let name = "olGetEventInfo"; +def olGetEventInfo : Function { let desc = "Queries the given property of the event."; let details = [ "`olGetEventInfoSize` can be used to query the storage size " @@ -77,8 +73,7 @@ def : Function { ]; } -def : Function { - let name = "olGetEventInfoSize"; +def olGetEventInfoSize : Function { let desc = "Returns the storage size of the given event query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td index 502fb36467db..2f5692a19d71 100644 --- a/offload/liboffload/API/Kernel.td +++ b/offload/liboffload/API/Kernel.td @@ -6,12 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file contains Offload API definitions related to launching kernels +// This file contains Offload API definitions related to kernels // //===----------------------------------------------------------------------===// -def : Struct { - let name = "ol_kernel_launch_size_args_t"; +def ol_kernel_launch_size_args_t : Struct { let desc = "Size-related arguments for a kernel launch."; let members = [ StructMember<"size_t", "Dimensions", "Number of work dimensions">, @@ -21,8 +20,7 @@ def : Struct { ]; } -def : Function { - let name = "olLaunchKernel"; +def olLaunchKernel : Function { let desc = "Enqueue a kernel launch with the specified size and parameters."; let details = [ "If a queue is not specified, kernel execution happens synchronously", @@ -42,3 +40,20 @@ def : Function { Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>, ]; } + +def olCalculateOptimalOccupancy : Function { + let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy."; + let details = [ + "For most devices, this will be the largest workgroup size that will result in all work items fitting on 
the device at once.", + ]; + let params = [ + Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, + Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, + Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>, + Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT> + ]; + let returns = [ + Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>, + Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>, + ]; +} diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index 5f7158588bc7..79e803833004 100644 --- a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_alloc_type_t"; +def ol_alloc_type_t : Enum { let desc = "Represents the type of allocation made with olMemAlloc."; let etors = [ Etor<"HOST", "Host allocation">, @@ -20,9 +19,11 @@ def : Enum { ]; } -def : Function { - let name = "olMemAlloc"; +def olMemAlloc : Function { let desc = "Creates a memory allocation on the specified device."; + let details = [ + "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory." 
+ ]; let params = [ Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>, Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>, @@ -36,8 +37,7 @@ def : Function { ]; } -def : Function { - let name = "olMemFree"; +def olMemFree : Function { let desc = "Frees a memory allocation previously made by olMemAlloc."; let params = [ Param<"void*", "Address", "address of the allocation to free", PARAM_IN>, @@ -45,8 +45,57 @@ def : Function { let returns = []; } -def : Function { - let name = "olMemcpy"; +def ol_mem_info_t : Enum { + let desc = "Supported memory info."; + let is_typed = 1; + let etors = [ + TaggedEtor<"DEVICE", "ol_device_handle_t", "The handle of the device associated with the allocation.">, + TaggedEtor<"BASE", "void *", "Base address of this allocation.">, + TaggedEtor<"SIZE", "size_t", "Size of this allocation in bytes.">, + TaggedEtor<"TYPE", "ol_alloc_type_t", "Type of this allocation.">, + ]; +} + +def olGetMemInfo : Function { + let desc = "Queries the given property of a memory allocation allocated with olMemAlloc."; + let details = [ + "`olGetMemInfoSize` can be used to query the storage size required for the given query.", + "The provided pointer can point to any location inside the allocation.", + ]; + let params = [ + Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>, + Param<"ol_mem_info_t", "PropName", "type of the info to retrieve", PARAM_IN>, + Param<"size_t", "PropSize", "the number of bytes pointed to by PropValue.", PARAM_IN>, + TypeTaggedParam<"void*", "PropValue", "array of bytes holding the info. 
" + "If PropSize is not equal to or greater than the real number of bytes needed to return the info " + "then the OL_ERRC_INVALID_SIZE error is returned and PropValue is not used.", PARAM_OUT, + TypeInfo<"PropName" , "PropSize">> + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", [ + "`PropSize == 0`", + "If `PropSize` is less than the real number of bytes needed to return the info." + ]>, + Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]> + ]; +} + +def olGetMemInfoSize : Function { + let desc = "Returns the storage size of the given memory info query."; + let details = [ + "The provided pointer can point to any location inside the allocation.", + ]; + let params = [ + Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>, + Param<"ol_mem_info_t", "PropName", "type of the info to query", PARAM_IN>, + Param<"size_t*", "PropSizeRet", "pointer to the number of bytes required to store the query", PARAM_OUT> + ]; + let returns = [ + Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]> + ]; +} + +def olMemcpy : Function { let desc = "Enqueue a memcpy operation."; let details = [ "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.", @@ -63,3 +112,22 @@ def : Function { ]; let returns = []; } + +def olMemFill : Function { + let desc = "Fill memory with copies of the given pattern"; + let details = [ + "Filling with patterns larger than 4 bytes may be less performant", + "The destination pointer and queue must be associated with the same device", + "The fill size must be a multiple of the pattern size", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, + Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>, + Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>, + Param<"const void*", "PatternPtr", "pointer to the pattern to fill with", PARAM_IN>, + Param<"size_t", "FillSize", "number of bytes to fill", 
PARAM_IN>, + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]> + ]; +} diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td index 97c2cc2d0570..906f899076a8 100644 --- a/offload/liboffload/API/Platform.td +++ b/offload/liboffload/API/Platform.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_platform_info_t"; +def ol_platform_info_t : Enum { let desc = "Supported platform info."; let is_typed = 1; let etors = [ @@ -22,8 +21,7 @@ def : Enum { ]; } -def : Enum { - let name = "ol_platform_backend_t"; +def ol_platform_backend_t : Enum { let desc = "Identifies the native backend of the platform."; let etors =[ Etor<"UNKNOWN", "The backend is not recognized">, @@ -33,8 +31,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetPlatformInfo"; +def olGetPlatformInfo : Function { let desc = "Queries the given property of the platform."; let details = [ "`olGetPlatformInfoSize` can be used to query the storage size " @@ -61,8 +58,7 @@ def : Function { ]; } -def : Function { - let name = "olGetPlatformInfoSize"; +def olGetPlatformInfoSize : Function { let desc = "Returns the storage size of the given platform query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td index 0476fa1f7c27..7e11b3d8e331 100644 --- a/offload/liboffload/API/Program.td +++ b/offload/liboffload/API/Program.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateProgram"; +def olCreateProgram : Function { let desc = "Create a program for the device from the binary image pointed to by `ProgData`."; let details = [ "The provided `ProgData` will be copied and need not outlive the returned handle", @@ -25,8 +24,19 @@ def : Function { let returns = []; } -def : Function { - let name = 
"olDestroyProgram"; +def olIsValidBinary : Function { + let desc = "Validate if the binary image pointed to by `ProgData` is compatible with the device."; + let details = ["The provided `ProgData` will not be loaded onto the device"]; + let params = [ + Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>, + Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>, + Param<"size_t", "ProgDataSize", "size of the program binary in bytes", PARAM_IN>, + Param<"bool*", "Valid", "output is true if the image is compatible", PARAM_OUT> + ]; + let returns = []; +} + +def olDestroyProgram : Function { let desc = "Destroy the program and free all underlying resources."; let details = []; let params = [ diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td index 1d9f6f2d11c9..ededa9cc92fe 100644 --- a/offload/liboffload/API/Queue.td +++ b/offload/liboffload/API/Queue.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateQueue"; +def olCreateQueue : Function { let desc = "Create a queue for the given device."; let details = []; let params = [ @@ -21,8 +20,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olDestroyQueue"; +def olDestroyQueue : Function { let desc = "Destroy the queue and free all underlying resources."; let details = [ "Any work previously enqueued to the queue is still performed and any events generated for this queue remain valid." 
@@ -33,8 +31,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olSyncQueue"; +def olSyncQueue : Function { let desc = "Block the calling thread until the enqueued work on a queue is complete."; let details = []; let params = [ @@ -43,8 +40,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olWaitEvents"; +def olWaitEvents : Function { let desc = "Make any future work submitted to this queue wait until the provided events are complete."; let details = [ "All events in `Events` must complete before the queue is unblocked.", @@ -60,8 +56,7 @@ def : Function { ]; } -def : Enum { - let name = "ol_queue_info_t"; +def ol_queue_info_t : Enum { let desc = "Supported queue info."; let is_typed = 1; let etors = [ @@ -70,8 +65,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetQueueInfo"; +def olGetQueueInfo : Function { let desc = "Queries the given property of the queue."; let details = [ "`olGetQueueInfoSize` can be used to query the storage size " @@ -95,8 +89,7 @@ def : Function { ]; } -def : Function { - let name = "olGetQueueInfoSize"; +def olGetQueueInfoSize : Function { let desc = "Returns the storage size of the given queue query."; let details = []; let params = [ @@ -108,3 +101,27 @@ def : Function { Return<"OL_ERRC_INVALID_QUEUE"> ]; } + +def ol_host_function_cb_t : FptrTypedef { + let desc = "Host function for use by `olLaunchHostFunction`."; + let params = [ + Param<"void *", "UserData", "user specified data passed into `olLaunchHostFunction`.", PARAM_IN>, + ]; + let return = "void"; +} + +def olLaunchHostFunction : Function { + let desc = "Enqueue a callback function on the host."; + let details = [ + "The provided function will be called from the same process as the one that called `olLaunchHostFunction`.", + "The callback will not run until all previous work submitted to the queue has completed.", + "The callback must return before any work submitted to the queue after it is started.", + "The callback 
must not call any liboffload API functions or any backend specific functions (such as Cuda or HSA library functions).", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>, + Param<"ol_host_function_cb_t", "Callback", "the callback function to call on the host", PARAM_IN>, + Param<"void *", "UserData", "a pointer that will be passed verbatim to the callback function", PARAM_IN_OPTIONAL>, + ]; + let returns = []; +} diff --git a/offload/liboffload/API/Symbol.td b/offload/liboffload/API/Symbol.td index 2e94d703809e..c57a2e1b8363 100644 --- a/offload/liboffload/API/Symbol.td +++ b/offload/liboffload/API/Symbol.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_symbol_kind_t"; +def ol_symbol_kind_t : Enum { let desc = "The kind of a symbol"; let etors =[ Etor<"KERNEL", "a kernel object">, @@ -19,8 +18,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetSymbol"; +def olGetSymbol : Function { let desc = "Get a symbol (kernel or global variable) identified by `Name` in the given program."; let details = [ "Symbol handles are owned by the program and do not need to be manually destroyed." 
@@ -34,8 +32,7 @@ def : Function { let returns = []; } -def : Enum { - let name = "ol_symbol_info_t"; +def ol_symbol_info_t : Enum { let desc = "Supported symbol info."; let is_typed = 1; let etors = [ @@ -45,8 +42,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetSymbolInfo"; +def olGetSymbolInfo : Function { let desc = "Queries the given property of the symbol."; let details = [ "`olGetSymbolInfoSize` can be used to query the storage size " @@ -73,8 +69,7 @@ def : Function { ]; } -def : Function { - let name = "olGetSymbolInfoSize"; +def olGetSymbolInfoSize : Function { let desc = "Returns the storage size of the given symbol query."; let details = []; let params = [ diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index f5365ca27430..c549ae04361d 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -39,55 +39,131 @@ using namespace llvm::omp::target; using namespace llvm::omp::target::plugin; using namespace error; +struct ol_platform_impl_t { + ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin, + ol_platform_backend_t BackendType) + : Plugin(std::move(Plugin)), BackendType(BackendType) {} + std::unique_ptr<GenericPluginTy> Plugin; + llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices; + ol_platform_backend_t BackendType; + + /// Complete all pending work for this platform and perform any needed + /// cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this platform handle. + llvm::Error destroy(); +}; + // Handle type definitions. Ideally these would be 1:1 with the plugins, but // we add some additional data here for now to avoid churn in the plugin // interface. 
struct ol_device_impl_t { ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device, - ol_platform_handle_t Platform, InfoTreeNode &&DevInfo) + ol_platform_impl_t &Platform, InfoTreeNode &&DevInfo) : DeviceNum(DeviceNum), Device(Device), Platform(Platform), Info(std::forward<InfoTreeNode>(DevInfo)) {} + + ~ol_device_impl_t() { + assert(!OutstandingQueues.size() && + "Device object dropped with outstanding queues"); + } + int DeviceNum; GenericDeviceTy *Device; - ol_platform_handle_t Platform; + ol_platform_impl_t &Platform; InfoTreeNode Info; -}; -struct ol_platform_impl_t { - ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin, - ol_platform_backend_t BackendType) - : Plugin(std::move(Plugin)), BackendType(BackendType) {} - std::unique_ptr<GenericPluginTy> Plugin; - std::vector<ol_device_impl_t> Devices; - ol_platform_backend_t BackendType; + llvm::SmallVector<__tgt_async_info *> OutstandingQueues; + std::mutex OutstandingQueuesMutex; + + /// If the device has any outstanding queues that are now complete, remove it + /// from the list and return it. + /// + /// Queues may be added to the outstanding queue list by olDestroyQueue if + /// they are destroyed but not completed. + __tgt_async_info *getOutstandingQueue() { + // Not locking the `size()` access is fine here - In the worst case we + // either miss a queue that exists or loop through an empty array after + // taking the lock. Both are sub-optimal but not that bad. + if (OutstandingQueues.size()) { + std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex); + + // As queues are pulled and popped from this list, longer running queues + // naturally bubble to the start of the array. Hence looping backwards. 
+ for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend(); + Q++) { + if (!Device->hasPendingWork(*Q)) { + auto OutstandingQueue = *Q; + *Q = OutstandingQueues.back(); + OutstandingQueues.pop_back(); + return OutstandingQueue; + } + } + } + return nullptr; + } + + /// Complete all pending work for this device and perform any needed cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this device handle. + llvm::Error destroy() { + llvm::Error Result = Plugin::success(); + for (auto Q : OutstandingQueues) + if (auto Err = Device->synchronize(Q, /*Release=*/true)) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + OutstandingQueues.clear(); + return Result; + } }; +llvm::Error ol_platform_impl_t::destroy() { + llvm::Error Result = Plugin::success(); + for (auto &D : Devices) + if (auto Err = D->destroy()) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + + if (auto Res = Plugin->deinit()) + Result = llvm::joinErrors(std::move(Result), std::move(Res)); + + return Result; +} + struct ol_queue_impl_t { ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device) - : AsyncInfo(AsyncInfo), Device(Device) {} + : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {} __tgt_async_info *AsyncInfo; ol_device_handle_t Device; + // A unique identifier for the queue + size_t Id; + static std::atomic<size_t> IdCounter; }; +std::atomic<size_t> ol_queue_impl_t::IdCounter(0); struct ol_event_impl_t { - ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue) - : EventInfo(EventInfo), Queue(Queue) {} + ol_event_impl_t(void *EventInfo, ol_device_handle_t Device, + ol_queue_handle_t Queue) + : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) { + } // EventInfo may be null, in which case the event should be considered always // complete void *EventInfo; + ol_device_handle_t Device; + size_t QueueId; + // Events may outlive the queue - don't assume this is 
always valid. + // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check + // for queue equality instead. ol_queue_handle_t Queue; }; struct ol_program_impl_t { ol_program_impl_t(plugin::DeviceImageTy *Image, - std::unique_ptr<llvm::MemoryBuffer> ImageData, - const __tgt_device_image &DeviceImage) - : Image(Image), ImageData(std::move(ImageData)), - DeviceImage(DeviceImage) {} + llvm::MemoryBufferRef DeviceImage) + : Image(Image), DeviceImage(DeviceImage) {} plugin::DeviceImageTy *Image; - std::unique_ptr<llvm::MemoryBuffer> ImageData; std::mutex SymbolListMutex; - __tgt_device_image DeviceImage; + llvm::MemoryBufferRef DeviceImage; llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> KernelSymbols; llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> GlobalSymbols; }; @@ -108,6 +184,9 @@ namespace offload { struct AllocInfo { ol_device_handle_t Device; ol_alloc_type_t Type; + void *Start; + // One byte past the end + void *End; }; // Global shared state for liboffload @@ -125,12 +204,16 @@ struct OffloadContext { bool TracingEnabled = false; bool ValidationEnabled = true; DenseMap<void *, AllocInfo> AllocInfoMap{}; - SmallVector<ol_platform_impl_t, 4> Platforms{}; + std::mutex AllocInfoMapMutex{}; + // Partitioned list of memory base addresses. Each element in this list is a + // key in AllocInfoMap + llvm::SmallVector<void *> AllocBases{}; + SmallVector<std::unique_ptr<ol_platform_impl_t>, 4> Platforms{}; size_t RefCount; ol_device_handle_t HostDevice() { // The host platform is always inserted last - return &Platforms.back().Devices[0]; + return Platforms.back()->Devices[0].get(); } static OffloadContext &get() { @@ -169,37 +252,35 @@ Error initPlugins(OffloadContext &Context) { // Attempt to create an instance of each supported plugin. 
#define PLUGIN_TARGET(Name) \ do { \ - Context.Platforms.emplace_back(ol_platform_impl_t{ \ - std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), \ - pluginNameToBackend(#Name)}); \ + if (StringRef(#Name) != "host") \ + Context.Platforms.emplace_back(std::make_unique<ol_platform_impl_t>( \ + std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), \ + pluginNameToBackend(#Name))); \ } while (false); #include "Shared/Targets.def" // Preemptively initialize all devices in the plugin for (auto &Platform : Context.Platforms) { - // Do not use the host plugin - it isn't supported. - if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN) - continue; - auto Err = Platform.Plugin->init(); + auto Err = Platform->Plugin->init(); [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); - for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices(); + for (auto DevNum = 0; DevNum < Platform->Plugin->number_of_devices(); DevNum++) { - if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) { - auto Device = &Platform.Plugin->getDevice(DevNum); + if (Platform->Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) { + auto Device = &Platform->Plugin->getDevice(DevNum); auto Info = Device->obtainInfoImpl(); if (auto Err = Info.takeError()) return Err; - Platform.Devices.emplace_back(DevNum, Device, &Platform, - std::move(*Info)); + Platform->Devices.emplace_back(std::make_unique<ol_device_impl_t>( + DevNum, Device, *Platform, std::move(*Info))); } } } // Add the special host device auto &HostPlatform = Context.Platforms.emplace_back( - ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST}); - HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{}); - Context.HostDevice()->Platform = &HostPlatform; + std::make_unique<ol_platform_impl_t>(nullptr, OL_PLATFORM_BACKEND_HOST)); + HostPlatform->Devices.emplace_back(std::make_unique<ol_device_impl_t>( + -1, nullptr, *HostPlatform, InfoTreeNode{})); Context.TracingEnabled = std::getenv("OFFLOAD_TRACE"); 
Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION"); @@ -236,10 +317,10 @@ Error olShutDown_impl() { for (auto &P : OldContext->Platforms) { // Host plugin is nullptr and has no deinit - if (!P.Plugin || !P.Plugin->is_initialized()) + if (!P->Plugin || !P->Plugin->is_initialized()) continue; - if (auto Res = P.Plugin->deinit()) + if (auto Res = P->destroy()) Result = llvm::joinErrors(std::move(Result), std::move(Res)); } @@ -302,10 +383,57 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, }; // These are not implemented by the plugin interface - if (PropName == OL_DEVICE_INFO_PLATFORM) - return Info.write<void *>(Device->Platform); - if (PropName == OL_DEVICE_INFO_TYPE) + switch (PropName) { + case OL_DEVICE_INFO_PLATFORM: + return Info.write<void *>(&Device->Platform); + + case OL_DEVICE_INFO_TYPE: return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_GPU); + + case OL_DEVICE_INFO_SINGLE_FP_CONFIG: + case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: { + ol_device_fp_capability_flags_t flags{0}; + flags |= OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | + OL_DEVICE_FP_CAPABILITY_FLAG_FMA; + return Info.write(flags); + } + + case OL_DEVICE_INFO_HALF_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>(0); + + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + return Info.write<uint32_t>(1); + + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + return Info.write<uint32_t>(0); + + // None of the existing plugins specify a limit on a single allocation, + // so return the 
global memory size instead + case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + [[fallthrough]]; + // AMD doesn't provide the global memory size (trivially) with the device info + // struct, so use the plugin interface + case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: { + uint64_t Mem; + if (auto Err = Device->Device->getDeviceMemorySize(Mem)) + return Err; + return Info.write<uint64_t>(Mem); + } break; + + default: + break; + } + if (PropName >= OL_DEVICE_INFO_LAST) return createOffloadError(ErrorCode::INVALID_ENUMERATION, "getDeviceInfo enum '%i' is invalid", PropName); @@ -316,8 +444,10 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, "plugin did not provide a response for this information"); auto Entry = *EntryOpt; + // Retrieve properties from the plugin interface switch (PropName) { case OL_DEVICE_INFO_NAME: + case OL_DEVICE_INFO_PRODUCT_NAME: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -327,7 +457,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.writeString(std::get<std::string>(Entry->Value).c_str()); } - case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + case OL_DEVICE_INFO_MAX_WORK_SIZE: + case OL_DEVICE_INFO_VENDOR_ID: + case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: + case OL_DEVICE_INFO_ADDRESS_BITS: + case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: { // Uint32 values if (!std::holds_alternative<uint64_t>(Entry->Value)) return makeError(ErrorCode::BACKEND_FAILURE, @@ -339,6 +475,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast<uint32_t>(Value)); } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples ol_dimensions_t Out{0, 0, 0}; @@ -377,21 +514,61 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, assert(Device == OffloadContext::get().HostDevice()); InfoWriter Info(PropSize, PropValue, 
PropSizeRet); + constexpr auto uint32_max = std::numeric_limits<uint32_t>::max(); + switch (PropName) { case OL_DEVICE_INFO_PLATFORM: - return Info.write<void *>(Device->Platform); + return Info.write<void *>(&Device->Platform); case OL_DEVICE_INFO_TYPE: return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST); case OL_DEVICE_INFO_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_PRODUCT_NAME: + return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: return Info.writeString(LLVM_VERSION_STRING); case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - return Info.write<uint64_t>(1); + return Info.write<uint32_t>(1); case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1}); + case OL_DEVICE_INFO_MAX_WORK_SIZE: + return Info.write<uint32_t>(uint32_max); + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: + return Info.write<ol_dimensions_t>( + ol_dimensions_t{uint32_max, uint32_max, uint32_max}); + case OL_DEVICE_INFO_VENDOR_ID: + return Info.write<uint32_t>(0); + case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: + return Info.write<uint32_t>(1); + case OL_DEVICE_INFO_SINGLE_FP_CONFIG: + case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>( + OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | OL_DEVICE_FP_CAPABILITY_FLAG_FMA); + case OL_DEVICE_INFO_HALF_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>(0); + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case 
OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + return Info.write<uint32_t>(1); + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + return Info.write<uint32_t>(0); + case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: + case OL_DEVICE_INFO_ADDRESS_BITS: + return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits); + case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: + return Info.write<uint64_t>(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, "getDeviceInfo enum '%i' is invalid", PropName); @@ -419,8 +596,8 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device, Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) { for (auto &Platform : OffloadContext::get().Platforms) { - for (auto &Device : Platform.Devices) { - if (!Callback(&Device, UserData)) { + for (auto &Device : Platform->Devices) { + if (!Callback(Device.get(), UserData)) { break; } } @@ -441,47 +618,184 @@ TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) { } } +constexpr size_t MAX_ALLOC_TRIES = 50; Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, size_t Size, void **AllocationOut) { - auto Alloc = - Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type)); - if (!Alloc) - return Alloc.takeError(); - - *AllocationOut = *Alloc; - OffloadContext::get().AllocInfoMap.insert_or_assign(*Alloc, - AllocInfo{Device, Type}); - return Error::success(); + SmallVector<void *> Rejects; + + // Repeat the allocation up to a certain amount of times. If it happens to + // already be allocated (e.g. by a device from another vendor) throw it away + // and try again. 
+ for (size_t Count = 0; Count < MAX_ALLOC_TRIES; Count++) { + auto NewAlloc = Device->Device->dataAlloc(Size, nullptr, + convertOlToPluginAllocTy(Type)); + if (!NewAlloc) + return NewAlloc.takeError(); + + void *NewEnd = &static_cast<char *>(*NewAlloc)[Size]; + auto &AllocBases = OffloadContext::get().AllocBases; + auto &AllocInfoMap = OffloadContext::get().AllocInfoMap; + { + std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); + + // Check that this memory region doesn't overlap another one + // That is, the start of this allocation needs to be after another + // allocation's end point, and the end of this allocation needs to be + // before the next one's start. + // `Gap` is the first alloc who ends after the new alloc's start point. + auto Gap = + std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc, + [&](const void *Iter, const void *Val) { + return AllocInfoMap.at(Iter).End <= Val; + }); + if (Gap == AllocBases.end() || NewEnd <= AllocInfoMap.at(*Gap).Start) { + // Success, no conflict + AllocInfoMap.insert_or_assign( + *NewAlloc, AllocInfo{Device, Type, *NewAlloc, NewEnd}); + AllocBases.insert( + std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc), + *NewAlloc); + *AllocationOut = *NewAlloc; + + for (void *R : Rejects) + if (auto Err = + Device->Device->dataDelete(R, convertOlToPluginAllocTy(Type))) + return Err; + return Error::success(); + } + + // To avoid the next attempt allocating the same memory we just freed, we + // hold onto it until we complete the allocation + Rejects.push_back(*NewAlloc); + } + } + + // We've tried multiple times, and can't allocate a non-overlapping region. 
+ return createOffloadError(ErrorCode::BACKEND_FAILURE, + "failed to allocate non-overlapping memory"); } Error olMemFree_impl(void *Address) { - if (!OffloadContext::get().AllocInfoMap.contains(Address)) - return createOffloadError(ErrorCode::INVALID_ARGUMENT, - "address is not a known allocation"); - - auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address); - auto Device = AllocInfo.Device; - auto Type = AllocInfo.Type; + ol_device_handle_t Device; + ol_alloc_type_t Type; + { + std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); + if (!OffloadContext::get().AllocInfoMap.contains(Address)) + return createOffloadError(ErrorCode::INVALID_ARGUMENT, + "address is not a known allocation"); + + auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address); + Device = AllocInfo.Device; + Type = AllocInfo.Type; + OffloadContext::get().AllocInfoMap.erase(Address); + + auto &Bases = OffloadContext::get().AllocBases; + Bases.erase(std::lower_bound(Bases.begin(), Bases.end(), Address)); + } if (auto Res = Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type))) return Res; - OffloadContext::get().AllocInfoMap.erase(Address); + return Error::success(); +} + +Error olGetMemInfoImplDetail(const void *Ptr, ol_mem_info_t PropName, + size_t PropSize, void *PropValue, + size_t *PropSizeRet) { + InfoWriter Info(PropSize, PropValue, PropSizeRet); + std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); + + auto &AllocBases = OffloadContext::get().AllocBases; + auto &AllocInfoMap = OffloadContext::get().AllocInfoMap; + const AllocInfo *Alloc = nullptr; + if (AllocInfoMap.contains(Ptr)) { + // Fast case, we have been given the base pointer directly + Alloc = &AllocInfoMap.at(Ptr); + } else { + // Slower case, we need to look up the base pointer first + // Find the first memory allocation whose end is after the target pointer, + // and then check to see if it is in range + auto Loc = std::lower_bound(AllocBases.begin(), 
AllocBases.end(), Ptr, + [&](const void *Iter, const void *Val) { + return AllocInfoMap.at(Iter).End <= Val; + }); + if (Loc == AllocBases.end() || Ptr < AllocInfoMap.at(*Loc).Start) + return Plugin::error(ErrorCode::NOT_FOUND, + "allocated memory information not found"); + Alloc = &AllocInfoMap.at(*Loc); + } + + switch (PropName) { + case OL_MEM_INFO_DEVICE: + return Info.write<ol_device_handle_t>(Alloc->Device); + case OL_MEM_INFO_BASE: + return Info.write<void *>(Alloc->Start); + case OL_MEM_INFO_SIZE: + return Info.write<size_t>(static_cast<char *>(Alloc->End) - + static_cast<char *>(Alloc->Start)); + case OL_MEM_INFO_TYPE: + return Info.write<ol_alloc_type_t>(Alloc->Type); + default: + return createOffloadError(ErrorCode::INVALID_ENUMERATION, + "olGetMemInfo enum '%i' is invalid", PropName); + } return Error::success(); } +Error olGetMemInfo_impl(const void *Ptr, ol_mem_info_t PropName, + size_t PropSize, void *PropValue) { + return olGetMemInfoImplDetail(Ptr, PropName, PropSize, PropValue, nullptr); +} + +Error olGetMemInfoSize_impl(const void *Ptr, ol_mem_info_t PropName, + size_t *PropSizeRet) { + return olGetMemInfoImplDetail(Ptr, PropName, 0, nullptr, PropSizeRet); +} + Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) { auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device); - if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) + + auto OutstandingQueue = Device->getOutstandingQueue(); + if (OutstandingQueue) { + // The queue is empty, but we still need to sync it to release any temporary + // memory allocations or do other cleanup. 
+ if (auto Err = + Device->Device->synchronize(OutstandingQueue, /*Release=*/false)) + return Err; + CreatedQueue->AsyncInfo = OutstandingQueue; + } else if (auto Err = + Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) { return Err; + } *Queue = CreatedQueue.release(); return Error::success(); } -Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); } +Error olDestroyQueue_impl(ol_queue_handle_t Queue) { + auto *Device = Queue->Device; + // This is safe; as soon as olDestroyQueue is called it is not possible to add + // any more work to the queue, so if it's finished now it will remain finished + // forever. + auto Res = Device->Device->hasPendingWork(Queue->AsyncInfo); + if (!Res) + return Res.takeError(); + + if (!*Res) { + // The queue is complete, so sync it and throw it back into the pool. + if (auto Err = Device->Device->synchronize(Queue->AsyncInfo, + /*Release=*/true)) + return Err; + } else { + // The queue still has outstanding work. Store it so we can check it later. 
+ std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex); + Device->OutstandingQueues.push_back(Queue->AsyncInfo); + } + + return olDestroy(Queue); +} Error olSyncQueue_impl(ol_queue_handle_t Queue) { // Host plugin doesn't have a queue set so it's not safe to call synchronize @@ -509,7 +823,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events, "olWaitEvents asked to wait on a NULL event"); // Do nothing if the event is for this queue or the event is always complete - if (Event->Queue == Queue || !Event->EventInfo) + if (Event->QueueId == Queue->Id || !Event->EventInfo) continue; if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo)) @@ -553,11 +867,11 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName, } Error olSyncEvent_impl(ol_event_handle_t Event) { + // No event info means that this event was complete on creation if (!Event->EventInfo) - // Event always complete return Plugin::success(); - if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo)) return Res; return Error::success(); @@ -565,7 +879,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) { Error olDestroyEvent_impl(ol_event_handle_t Event) { if (Event->EventInfo) - if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo)) return Res; return olDestroy(Event); @@ -575,10 +889,22 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event, ol_event_info_t PropName, size_t PropSize, void *PropValue, size_t *PropSizeRet) { InfoWriter Info(PropSize, PropValue, PropSizeRet); + auto Queue = Event->Queue; switch (PropName) { case OL_EVENT_INFO_QUEUE: - return Info.write<ol_queue_handle_t>(Event->Queue); + return Info.write<ol_queue_handle_t>(Queue); + case OL_EVENT_INFO_IS_COMPLETE: { + // No event info means that this event was complete on creation + 
if (!Event->EventInfo) + return Info.write<bool>(true); + + auto Res = Queue->Device->Device->isEventComplete(Event->EventInfo, + Queue->AsyncInfo); + if (auto Err = Res.takeError()) + return Err; + return Info.write<bool>(*Res); + } default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, "olGetEventInfo enum '%i' is invalid", PropName); @@ -604,7 +930,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) { if (auto Err = Pending.takeError()) return Err; - *EventOut = new ol_event_impl_t(nullptr, Queue); + *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue); if (!*Pending) // Queue is empty, don't record an event and consider the event always // complete @@ -656,31 +982,31 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, return Error::success(); } +Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize, + const void *PatternPtr, size_t FillSize) { + return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize, + Queue->AsyncInfo); +} + Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData, size_t ProgDataSize, ol_program_handle_t *Program) { - // Make a copy of the program binary in case it is released by the caller. 
- auto ImageData = MemoryBuffer::getMemBufferCopy( - StringRef(reinterpret_cast<const char *>(ProgData), ProgDataSize)); - - auto DeviceImage = __tgt_device_image{ - const_cast<char *>(ImageData->getBuffer().data()), - const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize, nullptr, - nullptr}; - - ol_program_handle_t Prog = - new ol_program_impl_t(nullptr, std::move(ImageData), DeviceImage); - - auto Res = - Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage); - if (!Res) { - delete Prog; + StringRef Buffer(reinterpret_cast<const char *>(ProgData), ProgDataSize); + Expected<plugin::DeviceImageTy *> Res = + Device->Device->loadBinary(Device->Device->Plugin, Buffer); + if (!Res) return Res.takeError(); - } - assert(*Res != nullptr && "loadBinary returned nullptr"); + assert(*Res && "loadBinary returned nullptr"); - Prog->Image = *Res; - *Program = Prog; + *Program = new ol_program_impl_t(*Res, (*Res)->getMemoryBuffer()); + return Error::success(); +} +Error olIsValidBinary_impl(ol_device_handle_t Device, const void *ProgData, + size_t ProgDataSize, bool *IsValid) { + StringRef Buffer(reinterpret_cast<const char *>(ProgData), ProgDataSize); + *IsValid = Device->Device ? 
Device->Device->Plugin.isDeviceCompatible( + Device->Device->getDeviceId(), Buffer) + : false; return Error::success(); } @@ -696,6 +1022,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) { return olDestroy(Program); } +Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device, + ol_symbol_handle_t Kernel, + size_t DynamicMemSize, + size_t *GroupSize) { + if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL) + return createOffloadError(ErrorCode::SYMBOL_KIND, + "provided symbol is not a kernel"); + auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl); + + auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize); + if (auto Err = Res.takeError()) + return Err; + + *GroupSize = *Res; + + return Error::success(); +} + Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, @@ -765,7 +1109,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name, return Error::success(); } case OL_SYMBOL_KIND_GLOBAL_VARIABLE: { - auto &Global = Program->KernelSymbols[Name]; + auto &Global = Program->GlobalSymbols[Name]; if (!Global) { GlobalTy GlobalObj{Name}; if (auto Res = @@ -833,5 +1177,12 @@ Error olGetSymbolInfoSize_impl(ol_symbol_handle_t Symbol, return olGetSymbolInfoImplDetail(Symbol, PropName, 0, nullptr, PropSizeRet); } +Error olLaunchHostFunction_impl(ol_queue_handle_t Queue, + ol_host_function_cb_t Callback, + void *UserData) { + return Queue->Device->Device->enqueueHostCall(Callback, UserData, + Queue->AsyncInfo); +} + } // namespace offload } // namespace llvm diff --git a/offload/libomptarget/OpenMP/InteropAPI.cpp b/offload/libomptarget/OpenMP/InteropAPI.cpp index eb5425ecbf06..c55ef2c2e672 100644 --- a/offload/libomptarget/OpenMP/InteropAPI.cpp +++ b/offload/libomptarget/OpenMP/InteropAPI.cpp @@ -124,7 +124,7 @@ void *getProperty<void *>(omp_interop_val_t &InteropVal, case omp_ipr_device_context: return 
InteropVal.device_info.Context; case omp_ipr_targetsync: - return InteropVal.async_info->Queue; + return InteropVal.async_info ? InteropVal.async_info->Queue : nullptr; default:; } getTypeMismatch(Property, Err); @@ -167,7 +167,6 @@ bool getPropertyCheck(omp_interop_val_t **InteropPtr, omp_interop_property_t property_id, \ int *err) { \ omp_interop_val_t *interop_val = (omp_interop_val_t *)interop; \ - assert((interop_val)->interop_type == kmp_interop_type_targetsync); \ if (!getPropertyCheck(&interop_val, property_id, err)) { \ return (RETURN_TYPE)(0); \ } \ @@ -275,8 +274,8 @@ omp_interop_val_t *__tgt_interop_get(ident_t *LocRef, int32_t InteropType, return Interop; } -int __tgt_interop_use(ident_t *LocRef, omp_interop_val_t *Interop, - interop_ctx_t *Ctx, dep_pack_t *Deps) { +int __tgt_interop_use60(ident_t *LocRef, omp_interop_val_t *Interop, + interop_ctx_t *Ctx, dep_pack_t *Deps) { bool Nowait = Ctx->flags.nowait; DP("Call to %s with interop " DPxMOD ", nowait %" PRId32 "\n", __func__, DPxPTR(Interop), Nowait); @@ -359,6 +358,40 @@ EXTERN int ompx_interop_add_completion_callback(omp_interop_val_t *Interop, return omp_irc_success; } +// Backwards compatibility wrappers +void __tgt_interop_init(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t InteropType, + int32_t DeviceId, int32_t Ndeps, + kmp_depend_info_t *DepList, int32_t HaveNowait) { + constexpr int32_t old_kmp_interop_type_targetsync = 2; + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + InteropPtr = + __tgt_interop_get(LocRef, + InteropType == old_kmp_interop_type_targetsync + ? kmp_interop_type_targetsync + : kmp_interop_type_target, + DeviceId, 0, nullptr, &Ctx, Ndeps ? 
&Deps : nullptr); +} + +void __tgt_interop_use(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t DeviceId, + int32_t Ndeps, kmp_depend_info_t *DepList, + int32_t HaveNowait) { + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + __tgt_interop_use60(LocRef, InteropPtr, &Ctx, Ndeps ? &Deps : nullptr); +} + +void __tgt_interop_destroy(ident_t *LocRef, int32_t Gtid, + omp_interop_val_t *&InteropPtr, int32_t DeviceId, + int32_t Ndeps, kmp_depend_info_t *DepList, + int32_t HaveNowait) { + interop_ctx_t Ctx = {0, {false, (bool)HaveNowait, 0}, Gtid}; + dep_pack_t Deps = {Ndeps, 0, DepList, nullptr}; + __tgt_interop_release(LocRef, InteropPtr, &Ctx, Ndeps ? &Deps : nullptr); +} + } // extern "C" llvm::Expected<DeviceTy &> omp_interop_val_t::getDevice() const { diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index b57a2f815cba..c8d6b42114d0 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -219,7 +219,10 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { // Scan the RTLs that have associated images until we find one that supports // the current image. 
for (auto &R : plugins()) { - if (!R.is_plugin_compatible(Img)) + StringRef Buffer(reinterpret_cast<const char *>(Img->ImageStart), + utils::getPtrDiff(Img->ImageEnd, Img->ImageStart)); + + if (!R.isPluginCompatible(Buffer)) continue; if (!initializePlugin(R)) @@ -242,7 +245,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { continue; } - if (!R.is_device_compatible(DeviceId, Img)) + if (!R.isDeviceCompatible(DeviceId, Buffer)) continue; DP("Image " DPxMOD " is compatible with RTL %s device %d!\n", diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index f88e30ae9e76..71423ae0c94d 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -37,6 +37,8 @@ using namespace llvm::omp::target::ompt; #endif +using namespace llvm::omp::target::plugin; + int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const { // First, check if the user disabled atomic map transfer/malloc/dealloc. @@ -97,7 +99,55 @@ llvm::Error DeviceTy::init() { return llvm::Error::success(); } -// Load binary to device. +// Extract the mapping of host function pointers to device function pointers +// from the entry table. Functions marked as 'indirect' in OpenMP will have +// offloading entries generated for them which map the host's function pointer +// to a global containing the corresponding function pointer on the device. 
+static llvm::Expected<std::pair<void *, uint64_t>> +setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image, + __tgt_device_binary Binary) { + AsyncInfoTy AsyncInfo(Device); + llvm::ArrayRef<llvm::offloading::EntryTy> Entries(Image->EntriesBegin, + Image->EntriesEnd); + llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; + for (const auto &Entry : Entries) { + if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP || + Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) + continue; + + assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); + auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); + + void *Ptr; + if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + + HstPtr = Entry.Address; + if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + } + + // If we do not have any indirect globals we exit early. + if (IndirectCallTable.empty()) + return std::pair{nullptr, 0}; + + // Sort the array to allow for more efficient lookup of device pointers. + llvm::sort(IndirectCallTable, + [](const auto &x, const auto &y) { return x.first < y.first; }); + + uint64_t TableSize = + IndirectCallTable.size() * sizeof(std::pair<void *, void *>); + void *DevicePtr = Device.allocData(TableSize, nullptr, TARGET_ALLOC_DEVICE); + if (Device.submitData(DevicePtr, IndirectCallTable.data(), TableSize, + AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to copy data"); + return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); +} + +// Load binary to device and perform global initialization if needed. 
llvm::Expected<__tgt_device_binary> DeviceTy::loadBinary(__tgt_device_image *Img) { __tgt_device_binary Binary; @@ -105,6 +155,38 @@ DeviceTy::loadBinary(__tgt_device_image *Img) { if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS) return error::createOffloadError(error::ErrorCode::INVALID_BINARY, "failed to load binary %p", Img); + + // This symbol is optional. + void *DeviceEnvironmentPtr; + if (RTL->get_global(Binary, sizeof(DeviceEnvironmentTy), + "__omp_rtl_device_environment", &DeviceEnvironmentPtr)) + return Binary; + + // Obtain a table mapping host function pointers to device function pointers. + auto CallTablePairOrErr = setupIndirectCallTable(*this, Img, Binary); + if (!CallTablePairOrErr) + return CallTablePairOrErr.takeError(); + + GenericDeviceTy &GenericDevice = RTL->getDevice(RTLDeviceID); + DeviceEnvironmentTy DeviceEnvironment; + DeviceEnvironment.DeviceDebugKind = GenericDevice.getDebugKind(); + DeviceEnvironment.NumDevices = RTL->getNumDevices(); + // TODO: The device ID used here is not the real device ID used by OpenMP. 
+ DeviceEnvironment.DeviceNum = RTLDeviceID; + DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize(); + DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency(); + DeviceEnvironment.IndirectCallTable = + reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); + DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; + DeviceEnvironment.HardwareParallelism = + GenericDevice.getHardwareParallelism(); + + AsyncInfoTy AsyncInfo(*this); + if (submitData(DeviceEnvironmentPtr, &DeviceEnvironment, + sizeof(DeviceEnvironment), AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to copy data"); + return Binary; } @@ -191,6 +273,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } +int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) { + return RTL->data_fence(RTLDeviceID, AsyncInfo); +} + int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 8e2db6ba8bba..1374bfea8151 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -68,8 +68,11 @@ VERS1.0 { omp_get_interop_int; omp_get_interop_name; omp_get_interop_type_desc; - __tgt_interop_get; + __tgt_interop_init; __tgt_interop_use; + __tgt_interop_destroy; + __tgt_interop_get; + __tgt_interop_use60; __tgt_interop_release; __tgt_target_sync; __llvmPushCallConfiguration; diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index e9b148d8a260..fe1828976590 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -30,6 +30,7 @@ #include <cstdint> #include <cstdio> #include <cstdlib> +#include <memory> #ifdef OMPT_SUPPORT using namespace llvm::omp::target::ompt; @@ -165,12 +166,24 @@ targetData(ident_t *Loc, int64_t 
DeviceId, int32_t ArgNum, void **ArgsBase, OMPT_GET_RETURN_ADDRESS);) int Rc = OFFLOAD_SUCCESS; + + // Only allocate AttachInfo for targetDataBegin + std::unique_ptr<AttachInfoTy> AttachInfo; + if (TargetDataFunction == targetDataBegin) + AttachInfo = std::make_unique<AttachInfoTy>(); + Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo, - false /*FromMapper=*/); + AttachInfo.get(), /*FromMapper=*/false); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) { + // Process deferred ATTACH entries BEFORE synchronization + if (AttachInfo && !AttachInfo->AttachEntries.empty()) + Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo); + + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 5b25d955dd32..69725e77bae0 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, void *ArgMapper, AsyncInfoTy &AsyncInfo, - TargetDataFuncPtrTy TargetDataFunction) { + TargetDataFuncPtrTy TargetDataFunction, + AttachInfoTy *AttachInfo = nullptr) { DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper)); // The mapper function fills up Components. 
@@ -324,17 +325,193 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, - AsyncInfo, /*FromMapper=*/true); + AsyncInfo, AttachInfo, /*FromMapper=*/true); return Rc; } +/// Returns a buffer of the requested \p Size, to be used as the source for +/// `submitData`. +/// +/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's +/// getVoidPtrLocation(). +/// For larger buffers, creates a dynamic buffer which will be eventually +/// deleted by \p AsyncInfo's post-processing callback. +static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo, + int64_t Size) { + constexpr int64_t VoidPtrSize = sizeof(void *); + + if (Size <= VoidPtrSize) { + void *&BufferElement = AsyncInfo.getVoidPtrLocation(); + return reinterpret_cast<char *>(&BufferElement); + } + + // Create a dynamic buffer for larger data and schedule its deletion. + char *DataBuffer = new char[Size]; + AsyncInfo.addPostProcessingFunction([DataBuffer]() { + delete[] DataBuffer; + return OFFLOAD_SUCCESS; + }); + return DataBuffer; +} + +/// Calculates the target pointee base by applying the host +/// pointee begin/base delta to the target pointee begin. 
+/// +/// ``` +/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase) +/// ``` +static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin, + void *TgtPteeBegin) { + uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) - + reinterpret_cast<uint64_t>(HstPteeBase); + void *TgtPteeBase = reinterpret_cast<void *>( + reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta); + + DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD + ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", + DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); + DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD + "\n", + DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + + return TgtPteeBase; +} + +/// Utility function to perform a pointer attachment operation. +/// +/// For something like: +/// ```cpp +/// int *p; +/// ... +/// #pragma omp target enter data map(to:p[10:10]) +/// ``` +/// +/// for which the attachment operation gets represented using: +/// ``` +/// &p, &p[10], sizeof(p), ATTACH +/// ``` +/// +/// (Hst|Tgt)PtrAddr represents &p +/// (Hst|Tgt)PteeBase represents &p[0] +/// (Hst|Tgt)PteeBegin represents &p[10] +/// +/// This function first computes the expected TgtPteeBase using: +/// `<Select>TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)` +/// +/// and then attaches TgtPteeBase to TgtPtrAddr. +/// +/// \p HstPtrSize represents the size of the pointer p. For C/C++, this +/// should be same as "sizeof(void*)" (say 8). +/// +/// However, for Fortran, pointers/allocatables, which are also eligible for +/// "pointer-attachment", may be implemented using descriptors that contain the +/// address of the pointee in the first 8 bytes, but also contain other +/// information such as lower-bound/upper-bound etc in their subsequent fields. +/// +/// For example, for the following: +/// ```fortran +/// integer, allocatable :: x(:) +/// integer, pointer :: p(:) +/// ... +/// p => x(10: 19) +/// ... 
+/// !$omp target enter data map(to:p(:)) +/// ``` +/// +/// The map should trigger a pointer-attachment (assuming the pointer-attachment +/// conditions as noted on processAttachEntries are met) between the descriptor +/// for p, and its pointee data. +/// +/// Since only the first 8 bytes of the descriptor contain the address of the +/// pointee, an attachment operation on device descriptors involves: +/// * Setting the first 8 bytes of the device descriptor to point the device +/// address of the pointee. +/// * Copying the remaining information about bounds/offset etc. from the host +/// descriptor to the device descriptor. +/// +/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps, +/// like: +/// ``` +/// &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ +/// ``` +/// by using `sizeof(void*)` as \p HstPtrSize. +static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, + void **HstPtrAddr, void *HstPteeBase, + void *HstPteeBegin, void **TgtPtrAddr, + void *TgtPteeBegin, int64_t HstPtrSize, + TargetPointerResultTy &PtrTPR) { + assert(PtrTPR.getEntry() && + "Need a valid pointer entry to perform pointer-attachment"); + + constexpr int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small"); + + void *TgtPteeBase = + calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin); + + // Add shadow pointer tracking + if (!PtrTPR.getEntry()->addShadowPointer( + ShadowPtrInfoTy{HstPtrAddr, TgtPtrAddr, TgtPteeBase, HstPtrSize})) { + DP("Pointer " DPxMOD " is already attached to " DPxMOD "\n", + DPxPTR(TgtPtrAddr), DPxPTR(TgtPteeBase)); + return OFFLOAD_SUCCESS; + } + + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr), + DPxPTR(TgtPteeBase)); + + // Lambda to handle submitData result and perform final steps. 
+ auto HandleSubmitResult = [&](int SubmitResult) -> int { + if (SubmitResult != OFFLOAD_SUCCESS) { + REPORT("Failed to update pointer on device.\n"); + return OFFLOAD_FAIL; + } + + if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + }; + + // Get a buffer to be used as the source for data submission. + char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); + + // The pointee's address should occupy the first VoidPtrSize bytes + // irrespective of HstPtrSize. + std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize); + + // For larger "pointers" (e.g., Fortran descriptors), copy remaining + // descriptor fields from the host descriptor into the buffer. + if (HstPtrSize > VoidPtrSize) { + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = + reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize; + std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + + DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD + ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD + ")\n", + HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, + DPxPTR(HstDescriptorFieldsAddr)); + } + + // Submit the populated source buffer to device. + int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize, + AsyncInfo, PtrTPR.getEntry()); + return HandleSubmitResult(SubmitResult); +} + /// Internal function to do the mapping and transfer the data to the device int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper) { + AttachInfoTy *AttachInfo, bool FromMapper) { + assert(AttachInfo && "AttachInfo must be available for targetDataBegin for " + "handling ATTACH map-types."); // process each input. 
for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. @@ -352,7 +529,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, - targetDataBegin); + targetDataBegin, AttachInfo); if (Rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -369,6 +546,25 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t DataSize = ArgSizes[I]; map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; + // ATTACH map-types are supposed to be handled after all mapping for the + // construct is done. Defer their processing. + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + const bool IsCorrespondingPointerInit = + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE); + // We don't need to keep track of PRIVATE | ATTACH entries. They + // represent corresponding-pointer-initialization, and are handled + // similar to firstprivate (PRIVATE | TO) entries by + // PrivateArgumentManager. + if (!IsCorrespondingPointerInit) + AttachInfo->AttachEntries.emplace_back( + /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, + /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], + /*PointeeName=*/HstPtrName); + + DP("Deferring ATTACH map-type processing for argument %d\n", I); + continue; + } + // Adjust for proper alignment if this is a combined entry (for structs). // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. @@ -434,13 +630,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. 
+ if (PointerTpr.Flags.IsNewEntry && !IsHostPtr) + AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *); + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), (PointerTpr.Flags.IsNewEntry ? "" : " not")); PointerHstPtrBegin = HstPtrBase; // modify current entry. - HstPtrBase = *(void **)HstPtrBase; + HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); // No need to update pointee ref count for the first element of the // subelement that comes from mapper. UpdateRef = @@ -464,6 +665,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. + if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin) + AttachInfo->NewAllocations[HstPtrBegin] = DataSize; + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD " - is%s new\n", DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? 
"" : " not")); @@ -476,30 +682,13 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - - if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{ - (void **)PointerHstPtrBegin, HstPtrBase, - (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) { - DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); - - void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); - TgtPtrBase = ExpectedTgtPtrBase; - - int Ret = - Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *), - AsyncInfo, PointerTpr.getEntry()); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) != - OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + int Ret = performPointerAttachment( + Device, AsyncInfo, reinterpret_cast<void **>(PointerHstPtrBegin), + HstPtrBase, HstPtrBegin, + reinterpret_cast<void **>(PointerTgtPtrBegin), TgtPtrBegin, + sizeof(void *), PointerTpr); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; } // Check if variable can be used on the device: @@ -515,6 +704,189 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, return OFFLOAD_SUCCESS; } +/// Process deferred ATTACH map entries collected during targetDataBegin. 
+/// +/// From OpenMP's perspective, when mapping something that has a base pointer, +/// such as: +/// ```cpp +/// int *p; +/// #pragma omp enter target data map(to: p[10:20]) +/// ``` +/// +/// a pointer-attachment between p and &p[10] should occur if both p and +/// p[10] are present on the device after doing all allocations for all maps +/// on the construct, and one of the following is true: +/// +/// * The pointer p was newly allocated while handling the construct +/// * The pointee p[10:20] was newly allocated while handling the construct +/// * attach(always) map-type modifier was specified (OpenMP 6.1) +/// +/// That's why we collect all attach entries and new memory allocations during +/// targetDataBegin, and use that information to make the decision of whether +/// to perform a pointer-attachment or not here, after maps have been handled. +/// +/// Additionally, once we decide that a pointer-attachment should be performed, +/// we need to make sure that it happens after any previously submitted data +/// transfers have completed, to avoid the possibility of the pending transfers +/// clobbering the attachment. For example: +/// +/// ```cpp +/// int *p = ...; +/// int **pp = &p; +/// map(to: pp[0], p[0]) +/// ``` +/// +/// Which would be represented by: +/// ``` +/// &pp[0], &pp[0], sizeof(pp[0]), TO (1) +/// &p[0], &p[0], sizeof(p[0]), TO (2) +/// +/// &pp, &pp[0], sizeof(pp), ATTACH (3) +/// &p, &p[0], sizeof(p), ATTACH (4) +/// ``` +/// +/// (4) and (1) are both trying to modify the device memory corresponding to +/// `&p`. So, if we decide that (4) should do an attachment, we also need to +/// ensure that (4) happens after (1) is complete. +/// +/// For this purpose, we insert a data_fence before the first +/// pointer-attachment, (3), to ensure that all pending transfers finish first. 
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo) { + // Report all tracked allocations from both main loop and ATTACH processing + if (!AttachInfo.NewAllocations.empty()) { + DP("Tracked %u total new allocations:\n", + (unsigned)AttachInfo.NewAllocations.size()); + for ([[maybe_unused]] const auto &Alloc : AttachInfo.NewAllocations) { + DP(" Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n", + DPxPTR(Alloc.first), Alloc.second); + } + } + + if (AttachInfo.AttachEntries.empty()) + return OFFLOAD_SUCCESS; + + DP("Processing %zu deferred ATTACH map entries\n", + AttachInfo.AttachEntries.size()); + + int Ret = OFFLOAD_SUCCESS; + bool IsFirstPointerAttachment = true; + for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size(); + ++EntryIdx) { + const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx]; + + void **HstPtr = reinterpret_cast<void **>(AttachEntry.PointerBase); + + void *HstPteeBase = *HstPtr; + void *HstPteeBegin = AttachEntry.PointeeBegin; + + int64_t PtrSize = AttachEntry.PointerSize; + int64_t MapType = AttachEntry.MapType; + + DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD + ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n", + EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType); + + const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS; + + // Lambda to check if a pointer was newly allocated + auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) { + bool IsNewlyAllocated = + llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) { + void *AllocPtr = Alloc.first; + int64_t AllocSize = Alloc.second; + return Ptr >= AllocPtr && + Ptr < reinterpret_cast<void *>( + reinterpret_cast<char *>(AllocPtr) + AllocSize); + }); + DP("Attach %s " DPxMOD " was newly allocated: %s\n", PtrName, DPxPTR(Ptr), + IsNewlyAllocated ? 
"yes" : "no"); + return IsNewlyAllocated; + }; + + // Only process ATTACH if either the pointee or the pointer was newly + // allocated, or the ALWAYS flag is set. + if (!IsAttachAlways && !WasNewlyAllocated(HstPteeBegin, "pointee") && + !WasNewlyAllocated(HstPtr, "pointer")) { + DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly " + "allocated and no ALWAYS flag\n", + EntryIdx); + continue; + } + + // Lambda to perform target pointer lookup and validation + auto LookupTargetPointer = + [&](void *Ptr, int64_t Size, + const char *PtrType) -> std::optional<TargetPointerResultTy> { + // ATTACH map-type does not change ref-count, or do any allocation + // We just need to do a lookup for the pointer/pointee. + TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin( + Ptr, Size, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false, /*MustContain=*/true); + + DP("Attach %s lookup - IsPresent=%s, IsHostPtr=%s\n", PtrType, + TPR.isPresent() ? "yes" : "no", + TPR.Flags.IsHostPointer ? "yes" : "no"); + + if (!TPR.isPresent()) { + DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx, + PtrType); + return std::nullopt; + } + if (TPR.Flags.IsHostPointer) { + DP("Skipping ATTACH entry %zu: device version of the %s is a host " + "pointer.\n", + EntryIdx, PtrType); + return std::nullopt; + } + + return TPR; + }; + + // Get device version of the pointee (e.g., &p[10]) first, as we can + // release its TPR after extracting the pointer value. + void *TgtPteeBegin = [&]() -> void * { + if (auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee")) + return PteeTPROpt->TargetPointer; + return nullptr; + }(); + + if (!TgtPteeBegin) + continue; + + // Get device version of the pointer (e.g., &p) next. We need to keep its + // TPR for use in shadow-pointer handling during pointer-attachment. 
+ auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer"); + if (!PtrTPROpt) + continue; + TargetPointerResultTy &PtrTPR = *PtrTPROpt; + void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer); + + // Insert a data-fence before the first pointer-attachment. + if (IsFirstPointerAttachment) { + IsFirstPointerAttachment = false; + DP("Inserting a data fence before the first pointer attachment.\n"); + Ret = Device.dataFence(AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to insert data fence.\n"); + return OFFLOAD_FAIL; + } + } + + // Do the pointer-attachment, i.e. update the device pointer to point to + // device pointee. + Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase, + HstPteeBegin, TgtPtrBase, TgtPteeBegin, + PtrSize, PtrTPR); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + DP("ATTACH entry %zu processed successfully\n", EntryIdx); + } + + return OFFLOAD_SUCCESS; +} + namespace { /// This structure contains information to deallocate a target pointer, aka. /// used to fix up the shadow map and potentially delete the entry from the @@ -584,17 +956,29 @@ postProcessingTargetDataEnd(DeviceTy *Device, DelEntry = false; } - // If we copied back to the host a struct/array containing pointers, - // we need to restore the original host pointer values from their - // shadow copies. If the struct is going to be deallocated, remove any - // remaining shadow pointer entries for this struct. + // If we copied back to the host a struct/array containing pointers, or + // Fortran descriptors (which are larger than a "void *"), we need to + // restore the original host pointer/descriptor values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. 
const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM; if (HasFrom) { Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) { - *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", - DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); + constexpr int64_t VoidPtrSize = sizeof(void *); + if (ShadowPtr.PtrSize > VoidPtrSize) { + DP("Restoring host descriptor " DPxMOD + " to its original content (%" PRId64 + " bytes), containing pointee address " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrAddr), ShadowPtr.PtrSize, + DPxPTR(ShadowPtr.HstPtrContent.data())); + } else { + DP("Restoring host pointer " DPxMOD " to its original value " DPxMOD + "\n", + DPxPTR(ShadowPtr.HstPtrAddr), + DPxPTR(ShadowPtr.HstPtrContent.data())); + } + std::memcpy(ShadowPtr.HstPtrAddr, ShadowPtr.HstPtrContent.data(), + ShadowPtr.PtrSize); return OFFLOAD_SUCCESS; }); } @@ -624,7 +1008,8 @@ postProcessingTargetDataEnd(DeviceTy *Device, int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { int Ret = OFFLOAD_SUCCESS; auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>(); // process each input. @@ -635,6 +1020,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; + // Ignore ATTACH entries - they should only be honored on map-entering + // directives. They may be encountered here while handling the "end" part of + // "#pragma omp target". 
+ if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + DP("Ignoring ATTACH entry %d in targetDataEnd\n", I); + continue; + } + if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataEnd, call the // targetDataMapper variant which will call targetDataEnd again @@ -798,12 +1191,22 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, if (TPR.getEntry()) { int Ret = TPR.getEntry()->foreachShadowPointerInfo( [&](ShadowPtrInfoTy &ShadowPtr) { - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", - DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr)); + constexpr int64_t VoidPtrSize = sizeof(void *); + if (ShadowPtr.PtrSize > VoidPtrSize) { + DP("Restoring target descriptor " DPxMOD + " to its original content (%" PRId64 + " bytes), containing pointee address " DPxMOD "\n", + DPxPTR(ShadowPtr.TgtPtrAddr), ShadowPtr.PtrSize, + DPxPTR(ShadowPtr.TgtPtrContent.data())); + } else { + DP("Restoring target pointer " DPxMOD + " to its original value " DPxMOD "\n", + DPxPTR(ShadowPtr.TgtPtrAddr), + DPxPTR(ShadowPtr.TgtPtrContent.data())); + } Ret = Device.submitData(ShadowPtr.TgtPtrAddr, - (void *)&ShadowPtr.TgtPtrVal, - sizeof(void *), AsyncInfo); + ShadowPtr.TgtPtrContent.data(), + ShadowPtr.PtrSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -828,15 +1231,26 @@ static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, } // Wait for device-to-host memcopies for whole struct to complete, - // before restoring the correct host pointer. + // before restoring the correct host pointer/descriptor. 
if (auto *Entry = TPR.getEntry()) { AsyncInfo.addPostProcessingFunction([=]() -> int { int Ret = Entry->foreachShadowPointerInfo( [&](const ShadowPtrInfoTy &ShadowPtr) { - *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD - " for host pointer " DPxMOD "\n", - DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); + constexpr int64_t VoidPtrSize = sizeof(void *); + if (ShadowPtr.PtrSize > VoidPtrSize) { + DP("Restoring host descriptor " DPxMOD + " to its original content (%" PRId64 + " bytes), containing pointee address " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrAddr), ShadowPtr.PtrSize, + DPxPTR(ShadowPtr.HstPtrContent.data())); + } else { + DP("Restoring host pointer " DPxMOD + " to its original value " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrAddr), + DPxPTR(ShadowPtr.HstPtrContent.data())); + } + std::memcpy(ShadowPtr.HstPtrAddr, ShadowPtr.HstPtrContent.data(), + ShadowPtr.PtrSize); return OFFLOAD_SUCCESS; }); Entry->unlock(); @@ -900,7 +1314,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -1013,13 +1428,24 @@ class PrivateArgumentManagerTy { uint32_t Padding; /// Host pointer name map_var_info_t HstPtrName = nullptr; + /// For corresponding-pointer-initialization: host pointee base address. + void *HstPteeBase = nullptr; + /// For corresponding-pointer-initialization: host pointee begin address. + void *HstPteeBegin = nullptr; + /// Whether this argument needs corresponding-pointer-initialization. 
+ bool IsCorrespondingPointerInit = false; FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size, uint32_t Alignment, uint32_t Padding, - map_var_info_t HstPtrName = nullptr) + map_var_info_t HstPtrName = nullptr, + void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) : HstPtrBegin(reinterpret_cast<char *>(HstPtr)), HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment), - Size(Size), Padding(Padding), HstPtrName(HstPtrName) {} + Size(Size), Padding(Padding), HstPtrName(HstPtrName), + HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin), + IsCorrespondingPointerInit(IsCorrespondingPointerInit) {} }; /// A vector of target pointers for all private arguments @@ -1037,6 +1463,153 @@ class PrivateArgumentManagerTy { /// A pointer to a \p AsyncInfoTy object AsyncInfoTy &AsyncInfo; + /// \returns the value of the target pointee's base to be used for + /// corresponding-pointer-initialization. + void *getTargetPointeeBaseForCorrespondingPointerInitialization( + void *HstPteeBase, void *HstPteeBegin) { + // See if the pointee's begin address has corresponding storage on device. + void *TgtPteeBegin = [&]() -> void * { + if (!HstPteeBegin) { + DP("Corresponding-pointer-initialization: pointee begin address is " + "null\n"); + return nullptr; + } + + return Device.getMappingInfo() + .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false) + .TargetPointer; + }(); + + // If it does, we calculate target pointee base using it, and return it. + // Otherwise, we retain the host pointee's base as the target pointee base + // of the initialized pointer. It's the user's responsibility to ensure + // that if a lookup fails, the host pointee is accessible on the device. + return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, + TgtPteeBegin) + : HstPteeBase; + } + + /// Initialize the source buffer for corresponding-pointer-initialization. 
+ /// + /// It computes and stores the target pointee base address (or the host + /// pointee's base address, if lookup of target pointee fails) to the first + /// `sizeof(void*)` bytes of \p Buffer, and for larger pointers + /// (Fortran descriptors), the remaining fields of the host descriptor + /// \p HstPtr after those `sizeof(void*)` bytes. + /// + /// Corresponding-pointer-initialization represents the initialization of the + /// private version of a base-pointer/referring-pointer on a target construct. + /// + /// For example, for the following test: + /// ```cpp + /// int x[10]; + /// int *px = &x[0]; + /// ... + /// #pragma omp target data map(tofrom:px) + /// { + /// int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device()); + /// #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx) + /// { + /// foo(px, ppx); + /// } + /// } + /// ``` + /// The following shows a possible way to implement the mapping of `px`, + /// which is pre-determined firstprivate and should get initialized + /// via corresponding-pointer-initialization: + /// + /// (A) Possible way to implement the above with PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM // (1) + /// ; &px, &px[1], sizeof(px), ATTACH // (2) + /// ; &px, &px[1], sizeof(px), PRIVATE | ATTACH | PARAM // (3) + /// call... @__omp_outlined...(ptr %px, ptr %ppx) + /// define ... @__omp_outlined(ptr %px, ptr %ppx) {... + /// foo(%px, %ppx) + /// ...} + /// ``` + /// `(1)` maps the pointee `px[1]. + /// `(2)` attaches it to the mapped version of `px`. It can be controlled by + /// the user based on the `attach(auto/always/never)` map-type modifier. + /// `(3)` privatizes and initializes the private pointer `px`, and passes it + /// into the kernel as the argument `%px`. Can be skipped if `px` is not + /// referenced in the target construct. 
+ /// + /// While this method is not too beneficial compared to just doing the + /// initialization in the body of the kernel, like: + /// (B) Possible way to implement the above without PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM // (4) + /// ; &px, &px[1], sizeof(px), ATTACH // (5) + /// call... @__omp_outlined...(ptr %px0, ptr %ppx) + /// define ... __omp_outlined...(ptr %px0, ptr %ppx) { + /// %px = alloca ptr; + /// store ptr %px0, ptr %px + /// foo(%px, %ppx) + /// } + /// ``` + /// + /// (B) is not so convenient for Fortran descriptors, because in + /// addition to the lookup, the remaining fields of the descriptor have + /// to be passed into the kernel to initialize the private copy, which + /// makes (A) a cleaner option for them. e.g. + /// ```f90 + /// integer, pointer :: p(:) + /// !$omp target map(p(1)) + /// ``` + /// + /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH: + /// ```llvm + /// ; maps for p: + /// ; &p(1), &p(1), sizeof(p(1)), TO | FROM + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM + /// call... 
@__omp_outlined...(ptr %ref_ptr_of_p) + void initBufferForCorrespondingPointerInitialization(char *Buffer, + void *HstPtr, + int64_t HstPtrSize, + void *HstPteeBase, + void *HstPteeBegin) { + constexpr int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && + "corresponding-pointer-initialization: pointer size is too small"); + + void *TgtPteeBase = + getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase, + HstPteeBegin); + + // Store the target pointee base address to the first VoidPtrSize bytes + DP("Initializing corresponding-pointer-initialization source buffer " + "for " DPxMOD ", with pointee base " DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(TgtPteeBase)); + std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize); + if (HstPtrSize <= VoidPtrSize) + return; + + // For Fortran descriptors, copy the remaining descriptor fields from host + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = static_cast<char *>(HstPtr) + VoidPtrSize; + DP("Copying %" PRId64 + " bytes of descriptor fields into corresponding-pointer-initialization " + "buffer at offset %" PRId64 ", from " DPxMOD "\n", + HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr)); + std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + } + + /// Helper function to create and initialize a buffer to be used as the source + /// for corresponding-pointer-initialization. + void *createAndInitSourceBufferForCorrespondingPointerInitialization( + void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) { + char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); + initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize, + HstPteeBase, HstPteeBegin); + return Buffer; + } + // TODO: What would be the best value here? Should we make it configurable? 
// If the size is larger than this threshold, we will allocate and transfer it // immediately instead of packing it. @@ -1051,7 +1624,9 @@ public: int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, map_var_info_t HstPtrName = nullptr, - const bool AllocImmediately = false) { + const bool AllocImmediately = false, void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) { // If the argument is not first-private, or its size is greater than a // predefined threshold, we will allocate memory and issue the transfer // immediately. @@ -1074,9 +1649,19 @@ public: // If first-private, copy data from host if (IsFirstPrivate) { DP("Submitting firstprivate data to the device.\n"); - int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); + + // The source value used for corresponding-pointer-initialization + // is different vs regular firstprivates. + void *DataSource = + IsCorrespondingPointerInit + ? createAndInitSourceBufferForCorrespondingPointerInitialization( + HstPtr, ArgSize, HstPteeBase, HstPteeBegin) + : HstPtr; + int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); + DP("Copying %s data to device failed.\n", + IsCorrespondingPointerInit ? "corresponding-pointer-initialization" + : "firstprivate"); return OFFLOAD_FAIL; } } @@ -1122,8 +1707,10 @@ public: } } - FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, - StartAlignment, Padding, HstPtrName); + FirstPrivateArgInfo.emplace_back( + TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName, + HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit); + FirstPrivateArgSize += Padding + ArgSize; } @@ -1142,7 +1729,13 @@ public: for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { // First pad the pointer as we (have to) pad it on the device too. 
Itr = std::next(Itr, Info.Padding); - std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + + if (Info.IsCorrespondingPointerInit) + initBufferForCorrespondingPointerInitialization( + &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase, + Info.HstPteeBegin); + else + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); Itr = std::next(Itr, Info.Size); } // Allocate target memory @@ -1213,13 +1806,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + // Create AttachInfo for tracking any ATTACH entries, or new-allocations + // when handling the "begin" mapping for a target construct. + AttachInfoTy AttachInfo; + int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AsyncInfo, + &AttachInfo, false /*FromMapper=*/); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } + // Process collected ATTACH entries + if (!AttachInfo.AttachEntries.empty()) { + Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process ATTACH entries.\n"); + return OFFLOAD_FAIL; + } + } + // List of (first-)private arrays allocated for this target region SmallVector<int> TgtArgsPositions(ArgNum, -1); @@ -1284,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { + // For cases like: + // ``` + // int *p = ...; + // #pragma omp target map(p[0:10]) + // ``` + // `p` is predetermined firstprivate on the target construct, and the + // method to determine the initial value of the private copy on the + // device is called "corresponding-pointer-initialization". 
+ // + // Such firstprivate pointers that need + // corresponding-pointer-initialization are represented using the + // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate + // entries, which use `PRIVATE | TO`. The structure of these + // `PRIVATE | ATTACH` entries is the same as the non-private + // `ATTACH` entries used to represent pointer-attachments, i.e.: + // ``` + // &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr) + // ``` + const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH); + void *HstPteeBase = nullptr; + void *HstPteeBegin = nullptr; + if (IsAttach) { + // For corresponding-pointer-initialization, Args[I] is HstPteeBegin, + // and ArgBases[I] is both HstPtrBase/HstPtrBegin. + HstPteeBase = *reinterpret_cast<void **>(HstPtrBase); + HstPteeBegin = Args[I]; + HstPtrBegin = ArgBases[I]; + } TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; - const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); + // Corresponding-pointer-initialization is a special case of firstprivate, + // since it also involves initializing the private pointer. + const bool IsFirstPrivate = + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach; + // If there is a next argument and it depends on the current one, we need // to allocate the private memory immediately. 
If this is not the case, // then the argument can be marked for optimization and packed with the @@ -1294,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); Ret = PrivateArgumentManager.addArg( HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, - TgtArgs.size(), HstPtrName, AllocImmediately); + /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately, + HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach); if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process %sprivate argument " DPxMOD "\n", + REPORT("Failed to process %s%sprivate argument " DPxMOD "\n", + IsAttach ? "corresponding-pointer-initialization " : "", (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; } diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h index 0b3d54599048..90e5e1780e66 100644 --- a/offload/libomptarget/private.h +++ b/offload/libomptarget/private.h @@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId, const char *Type = nullptr; const char *Implicit = (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? 
"(implicit)" : ""; - if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) + + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH && + ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS) + Type = "attach:always"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) + Type = "attach"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && + ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) Type = "tofrom"; else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) Type = "to"; diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h index 61f680bab3a0..ad135f72fff1 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h @@ -71,9 +71,15 @@ typedef enum { } hsa_isa_info_t; typedef enum { + HSA_MACHINE_MODEL_SMALL = 0, + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +typedef enum { HSA_AGENT_INFO_NAME = 0, HSA_AGENT_INFO_VENDOR_NAME = 1, HSA_AGENT_INFO_FEATURE = 2, + HSA_AGENT_INFO_MACHINE_MODEL = 3, HSA_AGENT_INFO_PROFILE = 4, HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 3117763e3589..29cfe78082db 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -67,6 +67,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008, HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 796182075ff3..a7723b859881 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -99,7 +99,7 @@ namespace hsa_utils { /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. template <typename ElemTy, typename IterFuncTy, typename CallbackTy> -hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { +static hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); return (*Unwrapped)(Elem); @@ -111,7 +111,8 @@ hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { /// use this function directly but the specialized ones below instead. template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, typename CallbackTy> -hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { +static hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, + CallbackTy Cb) { auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); return (*Unwrapped)(Elem); @@ -123,7 +124,8 @@ hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { /// use this function directly but the specialized ones below instead. template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, typename IterFuncArgTy, typename CallbackTy> -hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { +static hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, + CallbackTy Cb) { auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); return (*Unwrapped)(Elem1, Elem2); @@ -132,21 +134,21 @@ hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { } /// Iterate agents. 
-template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { +template <typename CallbackTy> static Error iterateAgents(CallbackTy Callback) { hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); return Plugin::check(Status, "error in hsa_iterate_agents: %s"); } /// Iterate ISAs of an agent. template <typename CallbackTy> -Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { +static Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s"); } /// Iterate memory pools of an agent. template <typename CallbackTy> -Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { +static Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( hsa_amd_agent_iterate_memory_pools, Agent, Cb); return Plugin::check(Status, @@ -155,10 +157,12 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { /// Dispatches an asynchronous memory copy. /// Enables different SDMA engines for the dispatch in a round-robin fashion. 
-Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, - const void *Src, hsa_agent_t SrcAgent, size_t Size, - uint32_t NumDepSignals, const hsa_signal_t *DepSignals, - hsa_signal_t CompletionSignal) { +static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, + hsa_agent_t DstAgent, const void *Src, + hsa_agent_t SrcAgent, size_t Size, + uint32_t NumDepSignals, + const hsa_signal_t *DepSignals, + hsa_signal_t CompletionSignal) { if (!UseMultipleSdmaEngines) { hsa_status_t S = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size, @@ -193,8 +197,8 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, #endif } -Error getTargetTripleAndFeatures(hsa_agent_t Agent, - SmallVector<SmallString<32>> &Targets) { +static Error getTargetTripleAndFeatures(hsa_agent_t Agent, + SmallVector<SmallString<32>> &Targets) { auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { uint32_t Length; hsa_status_t Status; @@ -419,7 +423,11 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { assert(MemoryManager && "Invalid memory manager"); assert(PtrStorage && "Invalid pointer storage"); - *PtrStorage = MemoryManager->allocate(Size, nullptr); + auto PtrStorageOrErr = MemoryManager->allocate(Size, nullptr); + if (!PtrStorageOrErr) + return PtrStorageOrErr.takeError(); + + *PtrStorage = *PtrStorageOrErr; if (Size && *PtrStorage == nullptr) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failure to allocate from AMDGPU memory manager"); @@ -439,15 +447,12 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { private: /// Allocation callback that will be called once the memory manager does not /// have more previously allocated buffers. - void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; + Expected<void *> allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) override; /// Deallocation callback that will be called by the memory manager. 
- int free(void *TgtPtr, TargetAllocTy Kind) override { - if (auto Err = MemoryPool->deallocate(TgtPtr)) { - consumeError(std::move(Err)); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + Error free(void *TgtPtr, TargetAllocTy Kind) override { + return MemoryPool->deallocate(TgtPtr); } /// The underlying plugin that owns this memory manager. @@ -464,8 +469,8 @@ private: struct AMDGPUDeviceImageTy : public DeviceImageTy { /// Create the AMDGPU image with the id and the target image pointer. AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, - const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, Device, TgtImage) {} + std::unique_ptr<MemoryBuffer> &&TgtImage) + : DeviceImageTy(ImageId, Device, std::move(TgtImage)) {} /// Prepare and load the executable corresponding to the image. Error loadExecutable(const AMDGPUDeviceTy &Device); @@ -570,6 +575,16 @@ struct AMDGPUKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + /// + /// TODO: This needs to be implemented for amdgpu + Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const override { + return Plugin::error( + ErrorCode::UNSUPPORTED, + "occupancy calculations for AMDGPU are not yet implemented"); + } + /// Print more elaborate kernel launch info for AMDGPU Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], @@ -914,6 +929,7 @@ private: void *Dst; const void *Src; size_t Size; + size_t NumTimes; }; /// Utility struct holding arguments for freeing buffers to memory managers. @@ -964,9 +980,14 @@ private: StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} /// Schedule a host memory copy action on the slot. 
- Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { + /// + /// NumTimes repeats the copy that many times, sequentially in the dest + /// buffer. + Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size, + size_t NumTimes = 1) { Callbacks.emplace_back(memcpyAction); - ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; + ActionArgs.emplace_back().MemcpyArgs = + MemcpyArgsTy{Dst, Src, Size, NumTimes}; return Plugin::success(); } @@ -1063,6 +1084,20 @@ private: /// Indicate to spread data transfers across all available SDMAs bool UseMultipleSdmaEngines; + /// Wrapper function for implementing host callbacks + static void CallbackWrapper(AMDGPUSignalTy *InputSignal, + AMDGPUSignalTy *OutputSignal, + void (*Callback)(void *), void *UserData) { + // The wait call will not error in this context. + if (InputSignal) + if (auto Err = InputSignal->wait()) + reportFatalInternalError(std::move(Err)); + + Callback(UserData); + + OutputSignal->signal(); + } + /// Return the current number of asynchronous operations on the stream. uint32_t size() const { return NextSlot; } @@ -1192,7 +1227,11 @@ private: assert(Args->Dst && "Invalid destination buffer"); assert(Args->Src && "Invalid source buffer"); - std::memcpy(Args->Dst, Args->Src, Args->Size); + auto *BasePtr = Args->Dst; + for (size_t I = 0; I < Args->NumTimes; I++) { + std::memcpy(BasePtr, Args->Src, Args->Size); + BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size; + } return Plugin::success(); } @@ -1397,7 +1436,8 @@ public: /// manager once the operation completes. Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, - AMDGPUMemoryManagerTy &MemoryManager) { + AMDGPUMemoryManagerTy &MemoryManager, + size_t NumTimes = 1) { // Retrieve available signals for the operation's outputs. 
AMDGPUSignalTy *OutputSignals[2] = {}; if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) @@ -1419,7 +1459,8 @@ public: // The std::memcpy is done asynchronously using an async handler. We store // the function's information in the action but it is not actually a // post action. - if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) + if (auto Err = + Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes)) return Err; // Make changes on this slot visible to the async handler's thread. @@ -1440,7 +1481,11 @@ public: std::tie(Curr, InputSignal) = consume(OutputSignal); } else { // All preceding operations completed, copy the memory synchronously. - std::memcpy(Inter, Src, CopySize); + auto *InterPtr = Inter; + for (size_t I = 0; I < NumTimes; I++) { + std::memcpy(InterPtr, Src, CopySize); + InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize; + } // Return the second signal because it will not be used. OutputSignals[1]->decreaseUseCount(); @@ -1457,11 +1502,11 @@ public: if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + Agent, CopySize * NumTimes, 1, + &InputSignalRaw, OutputSignal->get()); } return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 0, nullptr, + Agent, CopySize * NumTimes, 0, nullptr, OutputSignal->get()); } @@ -1495,6 +1540,31 @@ public: OutputSignal->get()); } + Error pushHostCallback(void (*Callback)(void *), void *UserData) { + // Retrieve an available signal for the operation's output. 
+ AMDGPUSignalTy *OutputSignal = nullptr; + if (auto Err = SignalManager.getResource(OutputSignal)) + return Err; + OutputSignal->reset(); + OutputSignal->increaseUseCount(); + + AMDGPUSignalTy *InputSignal; + { + std::lock_guard<std::mutex> Lock(Mutex); + + // Consume stream slot and compute dependencies. + InputSignal = consume(OutputSignal).second; + } + + // "Leaking" the thread here is consistent with other work added to the + // queue. The input and output signals will remain valid until the output is + // signaled. + std::thread(CallbackWrapper, InputSignal, OutputSignal, Callback, UserData) + .detach(); + + return Plugin::success(); + } + /// Synchronize with the stream. The current thread waits until all operations /// are finalized and it performs the pending post actions (i.e., releasing /// intermediate buffers). @@ -1519,6 +1589,9 @@ public: /// actions for that and prior events. Error synchronizeOn(AMDGPUEventTy &Event); + /// Return true if the event from this queue is complete + Expected<bool> isEventComplete(const AMDGPUEventTy &Event); + /// Query the stream and complete pending post actions if operations finished. /// Return whether all the operations completed. This operation does not block /// the calling thread. 
@@ -1683,6 +1756,18 @@ Error AMDGPUStreamTy::synchronizeOn(AMDGPUEventTy &Event) { return completeUntil(Event.RecordedSlot); } +Expected<bool> AMDGPUStreamTy::isEventComplete(const AMDGPUEventTy &Event) { + std::lock_guard<std::mutex> Lock(Mutex); + assert(Event.RecordedStream == this && "event is for a different stream"); + + if (Event.RecordedSyncCycle < SyncCycle) { + return true; + } + assert(Event.RecordedSyncCycle == SyncCycle && "event is from the future?"); + + return !Slots[Event.RecordedSlot].Signal->load(); +} + struct AMDGPUStreamManagerTy final : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; @@ -2080,7 +2165,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image); // Unload the executable of the image. - return AMDImage.unloadExecutable(); + if (auto Err = AMDImage.unloadExecutable()) + return Err; + + // Destroy the associated memory and invalidate the object. + Plugin.free(Image); + return Error::success(); } /// Deinitialize the device and release its resources. 
@@ -2103,18 +2193,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { virtual Error callGlobalConstructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini")) - Image.setPendingGlobalDtors(); - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); } virtual Error callGlobalDestructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - if (Image.hasPendingGlobalDtors()) - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); - return Plugin::success(); + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); } uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } @@ -2241,11 +2325,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } /// Load the binary image into the device and allocate an image object. - Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, - int32_t ImageId) override { + Expected<DeviceImageTy *> + loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage, + int32_t ImageId) override { // Allocate and initialize the image object. AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>(); - new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); + new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, std::move(TgtImage)); // Load the HSA executable. if (Error Err = AMDImage->loadExecutable(*this)) @@ -2255,18 +2340,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } /// Allocate memory on the device or related to the device. - void *allocate(size_t Size, void *, TargetAllocTy Kind) override; + Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override; /// Deallocate memory on the device or related to the device. 
- int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { if (TgtPtr == nullptr) - return OFFLOAD_SUCCESS; + return Plugin::success(); AMDGPUMemoryPoolTy *MemoryPool = nullptr; switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: - case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: @@ -2277,17 +2361,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { break; } - if (!MemoryPool) { - REPORT("No memory pool for the specified allocation kind\n"); - return OFFLOAD_FAIL; - } + if (!MemoryPool) + return Plugin::error(ErrorCode::OUT_OF_RESOURCES, + "no memory pool for the specified allocation kind"); - if (Error Err = MemoryPool->deallocate(TgtPtr)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } + if (auto Err = MemoryPool->deallocate(TgtPtr)) + return Err; - return OFFLOAD_SUCCESS; + return Plugin::success(); } /// Synchronize current thread with the pending operations on the async info. @@ -2537,22 +2618,130 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } - /// Initialize the async info for interoperability purposes. + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for AMDGPU devices as operations inserted into + /// a queue are in-order. 
+ Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // Fast case, where we can use the 4 byte hsa_amd_memory_fill + if (Size % 4 == 0 && + (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) { + uint32_t Pattern; + if (PatternSize == 1) { + auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr); + Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24; + } else if (PatternSize == 2) { + auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr); + Pattern = *Word | (*Word << 16); + } else if (PatternSize == 4) { + Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr); + } else { + // Shouldn't be here if the pattern size is outwith those values + llvm_unreachable("Invalid pattern size"); + } + + if (hasPendingWorkImpl(AsyncInfoWrapper)) { + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + struct MemFillArgsTy { + void *Dst; + uint32_t Pattern; + int64_t Size; + }; + auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4}; + auto Fill = [](void *Data) { + MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data); + assert(Args && "Invalid arguments"); + + auto Status = + hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size); + delete Args; + auto Err = + Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + if (Err) { + FATAL_MESSAGE(1, "error performing async fill: %s", + toString(std::move(Err)).data()); + } + }; + + // hsa_amd_memory_fill doesn't signal completion using a signal, so use + // the existing host callback logic to handle that instead + return Stream->pushHostCallback(Fill, Args); + } + // If there is no pending work, do the fill synchronously + auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4); + return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + 
} + + // Slow case; allocate an appropriate memory size and enqueue copies + void *PinnedPtr = nullptr; + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) + return Err; + + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr, + PatternSize, PinnedMemoryManager, + Size / PatternSize); + } + + /// Initialize the async info Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { // TODO: Implement this function. return Plugin::success(); } - /// Initialize the device info for interoperability purposes. - Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { - DeviceInfo->Context = nullptr; + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override { + // TODO: update once targetsync is supported + if (InteropType == kmp_interop_type_target) + return interop_spec_t{tgt_fr_hsa, {false, 0}, 0}; + return interop_spec_t{tgt_fr_none, {false, 0}, 0}; + } + + Expected<omp_interop_val_t *> + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override { + auto *Ret = new omp_interop_val_t( + DeviceId, static_cast<kmp_interop_type_t>(InteropType)); + Ret->fr_id = tgt_fr_hsa; + Ret->vendor_id = omp_vendor_amd; + + // TODO: implement targetsync support - if (!DeviceInfo->Device) - DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); + Ret->device_info.Platform = nullptr; + Ret->device_info.Device = reinterpret_cast<void *>(Agent.handle); + Ret->device_info.Context = nullptr; + + return Ret; + } + Error releaseInterop(omp_interop_val_t *Interop) override { + if (Interop) + delete Interop; return Plugin::success(); } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + AMDGPUStreamTy *Stream = 
nullptr; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + return Stream->pushHostCallback(Callback, UserData); + }; + /// Create an event. Error createEventImpl(void **EventPtrStorage) override { AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); @@ -2591,7 +2780,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { - auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>(); + auto *Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>(); if (!Stream) return false; @@ -2601,6 +2790,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Query.takeError(); } + Expected<bool> isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfo) override { + AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); + auto *Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>(); + return Stream && Stream->isEventComplete(*Event); + } + /// Synchronize the current thread with the event. 
Error syncEventImpl(void *EventPtr) override { AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); @@ -2632,7 +2828,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Product Name", TmpChar); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) @@ -2642,10 +2838,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR); + Info.add("Vendor ID", uint64_t{4130}, "", DeviceInfo::VENDOR_ID); + + hsa_machine_model_t MachineModel; + Status = getDeviceAttrRaw(HSA_AGENT_INFO_MACHINE_MODEL, MachineModel); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Memory Address Size", + uint64_t{MachineModel == HSA_MACHINE_MODEL_SMALL ? 32u : 64u}, + "bits", DeviceInfo::ADDRESS_BITS); + hsa_device_type_t DevType; Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); if (Status == HSA_STATUS_SUCCESS) { - switch (DevType) { + switch (static_cast<int>(DevType)) { case HSA_DEVICE_TYPE_CPU: TmpCharPtr = "CPU"; break; @@ -2692,11 +2897,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Max Clock Freq", TmpUInt, "MHz"); + Info.add("Max Clock Freq", TmpUInt, "MHz", + DeviceInfo::MAX_CLOCK_FREQUENCY); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Max Memory Clock Freq", TmpUInt, "MHz", + DeviceInfo::MEMORY_CLOCK_RATE); Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Compute Units", TmpUInt); + Info.add("Compute Units", TmpUInt, "", 
DeviceInfo::NUM_COMPUTE_UNITS); Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); if (Status == HSA_STATUS_SUCCESS) @@ -2734,11 +2945,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Grid Max Size", TmpUInt); + Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE); Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxDim = *Info.add("Grid Max Size per Dimension"); + auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{}, + "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); MaxDim.add("x", GridMaxDim.x); MaxDim.add("y", GridMaxDim.y); MaxDim.add("z", GridMaxDim.z); @@ -2778,7 +2990,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); if (Status == HSA_STATUS_SUCCESS) - PoolNode.add("Size", TmpSt, "bytes"); + PoolNode.add( + "Size", TmpSt, "bytes", + (Pool->isGlobal() && Pool->isCoarseGrained()) + ? std::optional<DeviceInfo>{DeviceInfo::GLOBAL_MEM_SIZE} + : std::nullopt); Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, TmpBool); @@ -2910,7 +3126,7 @@ private: // Perform a quick check for the named kernel in the image. The kernel // should be created by the 'amdgpu-lower-ctor-dtor' pass. GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) + if (!Handler.isSymbolInImage(*this, Image, KernelName)) return Plugin::success(); // Allocate and construct the AMDGPU kernel. 
@@ -3461,11 +3677,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { - if (ArgsSize != LaunchParams.Size && - ArgsSize > LaunchParams.Size + getImplicitArgsSize()) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid kernel arguments size"); - AMDGPUPluginTy &AMDGPUPlugin = static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); @@ -3551,8 +3762,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, return Plugin::success(); // General Info - auto NumGroups = NumBlocks; - auto ThreadsPerGroup = NumThreads; + auto *NumGroups = NumBlocks; + auto *ThreadsPerGroup = NumThreads; // Kernel Arguments Info auto ArgNum = KernelArgs.NumArgs; @@ -3616,14 +3827,13 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc); } -void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, - TargetAllocTy Kind) { +Expected<void *> AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) { // Allocate memory from the pool. void *Ptr = nullptr; - if (auto Err = MemoryPool->allocate(Size, &Ptr)) { - consumeError(std::move(Err)); - return nullptr; - } + if (auto Err = MemoryPool->allocate(Size, &Ptr)) + return std::move(Err); + assert(Ptr && "Invalid pointer"); // Get a list of agents that can access this memory pool. @@ -3633,14 +3843,13 @@ void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); // Allow all valid kernel agents to access the allocation. 
- if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) + return std::move(Err); return Ptr; } -void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { +Expected<void *> AMDGPUDeviceTy::allocate(size_t Size, void *, + TargetAllocTy Kind) { if (Size == 0) return nullptr; @@ -3649,7 +3858,6 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: - case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemoryPool = CoarseGrainedMemoryPools[0]; break; case TARGET_ALLOC_HOST: @@ -3660,17 +3868,14 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { break; } - if (!MemoryPool) { - REPORT("No memory pool for the specified allocation kind\n"); - return nullptr; - } + if (!MemoryPool) + return Plugin::error(ErrorCode::UNSUPPORTED, + "no memory pool for the specified allocation kind"); // Allocate from the corresponding memory pool. void *Alloc = nullptr; - if (Error Err = MemoryPool->allocate(Size, &Alloc)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->allocate(Size, &Alloc)) + return std::move(Err); if (Alloc) { // Get a list of agents that can access this memory pool. Inherently @@ -3683,10 +3888,8 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { }); // Enable all valid kernel agents to access the buffer. 
- if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) + return std::move(Err); } return Alloc; diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h index 2ad0f2b7dd6c..68d82cbea0f3 100644 --- a/offload/plugins-nextgen/common/include/ErrorReporting.h +++ b/offload/plugins-nextgen/common/include/ErrorReporting.h @@ -61,7 +61,6 @@ class ErrorReporter { /// Return a nice name for an TargetAllocTy. static StringRef getAllocTyName(TargetAllocTy Kind) { switch (Kind) { - case TARGET_ALLOC_DEVICE_NON_BLOCKING: case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: return "device memory"; diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h index d62516d20764..b4e3712d9c98 100644 --- a/offload/plugins-nextgen/common/include/JIT.h +++ b/offload/plugins-nextgen/common/include/JIT.h @@ -51,27 +51,22 @@ struct JITEngine { /// Run jit compilation if \p Image is a bitcode image, otherwise simply /// return \p Image. It is expected to return a memory buffer containing the /// generated device image that could be loaded to the device directly. - Expected<const __tgt_device_image *> - process(const __tgt_device_image &Image, - target::plugin::GenericDeviceTy &Device); - - /// Remove \p Image from the jit engine's cache - void erase(const __tgt_device_image &Image, - target::plugin::GenericDeviceTy &Device); + Expected<std::unique_ptr<MemoryBuffer>> + process(StringRef Image, target::plugin::GenericDeviceTy &Device); private: /// Compile the bitcode image \p Image and generate the binary image that can /// be loaded to the target device of the triple \p Triple architecture \p /// MCpu. \p PostProcessing will be called after codegen to handle cases such /// as assembler as an external tool. 
- Expected<const __tgt_device_image *> - compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind, + Expected<std::unique_ptr<MemoryBuffer>> + compile(StringRef Image, const std::string &ComputeUnitKind, PostProcessingFn PostProcessing); /// Create or retrieve the object image file from the file system or via /// compilation of the \p Image. Expected<std::unique_ptr<MemoryBuffer>> - getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx, + getOrCreateObjFile(StringRef Image, LLVMContext &Ctx, const std::string &ComputeUnitKind); /// Run backend, which contains optimization and code generation. @@ -92,14 +87,6 @@ private: struct ComputeUnitInfo { /// LLVM Context in which the modules will be constructed. LLVMContext Context; - - /// A map of embedded IR images to the buffer used to store JITed code - DenseMap<const __tgt_device_image *, std::unique_ptr<MemoryBuffer>> - JITImages; - - /// A map of embedded IR images to JITed images. - DenseMap<const __tgt_device_image *, std::unique_ptr<__tgt_device_image>> - TgtImageMap; }; /// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h index a4f6e628c403..8f6c1adcdaa5 100644 --- a/offload/plugins-nextgen/common/include/MemoryManager.h +++ b/offload/plugins-nextgen/common/include/MemoryManager.h @@ -25,6 +25,10 @@ #include "Shared/Utils.h" #include "omptarget.h" +#include "llvm/Support/Error.h" + +namespace llvm { + /// Base class of per-device allocator. class DeviceAllocatorTy { public: @@ -32,11 +36,13 @@ public: /// Allocate a memory of size \p Size . \p HstPtr is used to assist the /// allocation. 
- virtual void *allocate(size_t Size, void *HstPtr, - TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; + virtual Expected<void *> + allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; /// Delete the pointer \p TgtPtr on the device - virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; + virtual Error free(void *TgtPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; }; /// Class of memory manager. The memory manager is per-device by using @@ -134,17 +140,17 @@ class MemoryManagerTy { size_t SizeThreshold = 1U << 13; /// Request memory from target device - void *allocateOnDevice(size_t Size, void *HstPtr) const { + Expected<void *> allocateOnDevice(size_t Size, void *HstPtr) const { return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE); } /// Deallocate data on device - int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } + Error deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } /// This function is called when it tries to allocate memory on device but the /// device returns out of memory. It will first free all memory in the /// FreeList and try to allocate again. - void *freeAndAllocate(size_t Size, void *HstPtr) { + Expected<void *> freeAndAllocate(size_t Size, void *HstPtr) { std::vector<void *> RemoveList; // Deallocate all memory in FreeList @@ -154,7 +160,8 @@ class MemoryManagerTy { if (List.empty()) continue; for (const NodeTy &N : List) { - deleteOnDevice(N.Ptr); + if (auto Err = deleteOnDevice(N.Ptr)) + return Err; RemoveList.push_back(N.Ptr); } FreeLists[I].clear(); @@ -175,14 +182,22 @@ class MemoryManagerTy { /// allocate directly on the device. If a \p nullptr is returned, it might /// be because the device is OOM. In that case, it will free all unused /// memory and then try again. 
- void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { - void *TgtPtr = allocateOnDevice(Size, HstPtr); + Expected<void *> allocateOrFreeAndAllocateOnDevice(size_t Size, + void *HstPtr) { + auto TgtPtrOrErr = allocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + + void *TgtPtr = *TgtPtrOrErr; // We cannot get memory from the device. It might be due to OOM. Let's // free all memory in FreeLists and try again. if (TgtPtr == nullptr) { DP("Failed to get memory on device. Free all memory in FreeLists and " "try again.\n"); - TgtPtr = freeAndAllocate(Size, HstPtr); + TgtPtrOrErr = freeAndAllocate(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + TgtPtr = *TgtPtrOrErr; } if (TgtPtr == nullptr) @@ -204,16 +219,17 @@ public: /// Destructor ~MemoryManagerTy() { - for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); - ++Itr) { - assert(Itr->second.Ptr && "nullptr in map table"); - deleteOnDevice(Itr->second.Ptr); + for (auto &PtrToNode : PtrToNodeTable) { + assert(PtrToNode.second.Ptr && "nullptr in map table"); + if (auto Err = deleteOnDevice(PtrToNode.second.Ptr)) + REPORT("Failure to delete memory: %s\n", + toString(std::move(Err)).data()); } } /// Allocate memory of size \p Size from target device. \p HstPtr is used to /// assist the allocation. - void *allocate(size_t Size, void *HstPtr) { + Expected<void *> allocate(size_t Size, void *HstPtr) { // If the size is zero, we will not bother the target device. Just return // nullptr directly. if (Size == 0) @@ -228,11 +244,14 @@ public: DP("%zu is greater than the threshold %zu. Allocate it directly from " "device\n", Size, SizeThreshold); - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); - DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + DP("Got target pointer " DPxMOD ". 
Return directly.\n", + DPxPTR(*TgtPtrOrErr)); - return TgtPtr; + return *TgtPtrOrErr; } NodeTy *NodePtr = nullptr; @@ -260,8 +279,11 @@ public: if (NodePtr == nullptr) { DP("Cannot find a node in the FreeLists. Allocate on device.\n"); // Allocate one on device - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + void *TgtPtr = *TgtPtrOrErr; if (TgtPtr == nullptr) return nullptr; @@ -282,7 +304,7 @@ public: } /// Deallocate memory pointed by \p TgtPtr - int free(void *TgtPtr) { + Error free(void *TgtPtr) { DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); NodeTy *P = nullptr; @@ -314,7 +336,7 @@ public: FreeLists[B].insert(*P); } - return OFFLOAD_SUCCESS; + return Error::success(); } /// Get the size threshold from the environment variable @@ -344,4 +366,6 @@ public: constexpr const size_t MemoryManagerTy::BucketSize[]; constexpr const int MemoryManagerTy::NumBuckets; +} // namespace llvm + #endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index c9ab34b024b7..8c530bba3882 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -193,7 +193,7 @@ struct InfoTreeNode { InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {} InfoTreeNode(std::string Key, VariantType Value, std::string Units) - : Key(Key), Value(Value), Units(Units) {} + : Key(std::move(Key)), Value(Value), Units(std::move(Units)) {} /// Add a new info entry as a child of this node. The entry requires at least /// a key string in \p Key. 
The value in \p Value is optional and can be any @@ -202,7 +202,7 @@ struct InfoTreeNode { /// use that value for an appropriate olGetDeviceInfo query template <typename T = std::monostate> InfoTreeNode *add(std::string Key, T Value = T(), - const std::string &Units = std::string(), + std::string Units = std::string(), std::optional<DeviceInfo> DeviceInfoKey = std::nullopt) { assert(!Key.empty() && "Invalid info key"); @@ -217,7 +217,8 @@ struct InfoTreeNode { else ValueVariant = std::string{Value}; - auto Ptr = &Children->emplace_back(Key, ValueVariant, Units); + auto Ptr = + &Children->emplace_back(std::move(Key), ValueVariant, std::move(Units)); if (DeviceInfoKey) DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1; @@ -306,26 +307,18 @@ class DeviceImageTy { /// not unique between different device; they may overlap. int32_t ImageId; - /// The pointer to the raw __tgt_device_image. - const __tgt_device_image *TgtImage; - const __tgt_device_image *TgtImageBitcode; + /// The managed image data. + std::unique_ptr<MemoryBuffer> Image; /// Reference to the device this image is loaded on. GenericDeviceTy &Device; - /// If this image has any global destructors that much be called. - /// FIXME: This is only required because we currently have no invariants - /// towards the lifetime of the underlying image. We should either copy - /// the image into memory locally or erase the pointers after init. - bool PendingGlobalDtors; - public: + virtual ~DeviceImageTy() = default; + DeviceImageTy(int32_t Id, GenericDeviceTy &Device, - const __tgt_device_image *Image) - : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device), - PendingGlobalDtors(false) { - assert(TgtImage && "Invalid target image"); - } + std::unique_ptr<MemoryBuffer> &&Image) + : ImageId(Id), Image(std::move(Image)), Device(Device) {} /// Get the image identifier within the device. 
int32_t getId() const { return ImageId; } @@ -333,33 +326,17 @@ public: /// Get the device that this image is loaded onto. GenericDeviceTy &getDevice() const { return Device; } - /// Get the pointer to the raw __tgt_device_image. - const __tgt_device_image *getTgtImage() const { return TgtImage; } - - void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) { - this->TgtImageBitcode = TgtImageBitcode; - } - - const __tgt_device_image *getTgtImageBitcode() const { - return TgtImageBitcode; - } - /// Get the image starting address. - void *getStart() const { return TgtImage->ImageStart; } + const void *getStart() const { return Image->getBufferStart(); } /// Get the image size. - size_t getSize() const { - return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); - } + size_t getSize() const { return Image->getBufferSize(); } /// Get a memory buffer reference to the whole image. MemoryBufferRef getMemoryBuffer() const { return MemoryBufferRef(StringRef((const char *)getStart(), getSize()), "Image"); } - /// Accessors to the boolean value - bool setPendingGlobalDtors() { return PendingGlobalDtors = true; } - bool hasPendingGlobalDtors() const { return PendingGlobalDtors; } }; /// Class implementing common functionalities of offload kernels. Each plugin @@ -388,6 +365,9 @@ struct GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const = 0; + /// Get the kernel name. 
const char *getName() const { return Name.c_str(); } @@ -414,6 +394,7 @@ struct GenericKernelTy { case OMP_TGT_EXEC_MODE_SPMD: case OMP_TGT_EXEC_MODE_GENERIC: case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: return true; } return false; @@ -431,6 +412,8 @@ protected: return "Generic"; case OMP_TGT_EXEC_MODE_GENERIC_SPMD: return "Generic-SPMD"; + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: + return "SPMD-No-Loop"; } llvm_unreachable("Unknown execution mode!"); } @@ -468,7 +451,8 @@ private: uint32_t BlockLimitClause[3], uint64_t LoopTripCount, uint32_t &NumThreads, bool IsNumThreadsFromUser) const; - /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. + /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop + /// or SPMD mode. bool isGenericSPMDMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; @@ -483,6 +467,10 @@ private: bool isBareMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE; } + bool isNoLoopMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + } /// The kernel name. std::string Name; @@ -820,19 +808,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Load the binary image into the device and return the target table. Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin, - const __tgt_device_image *TgtImage); + StringRef TgtImage); virtual Expected<DeviceImageTy *> - loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; + loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage, int32_t ImageId) = 0; /// Unload a previously loaded Image from the device Error unloadBinary(DeviceImageTy *Image); virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; - /// Setup the device environment if needed. Notice this setup may not be run - /// on some plugins. 
By default, it will be executed, but plugins can change - /// this behavior by overriding the shouldSetupDeviceEnvironment function. - Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); - /// Setup the global device memory pool, if the plugin requires one. Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize); @@ -944,6 +927,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Instert a data fence between previous data operations and the following + /// operations if necessary for the device + virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0; + /// Exchange data between devices (device to device transfer). Calling this /// function is only valid if GenericPlugin::isDataExchangable() passing the /// two devices returns true. @@ -953,17 +940,26 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Fill data on the device with a pattern from the host + Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Run the kernel associated with \p EntryPtr Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); - /// Initialize a __tgt_async_info structure. Related to interop features. + /// Initialize a __tgt_async_info structure. Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr); virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; - /// Initialize a __tgt_device_info structure. Related to interop features. 
- Error initDeviceInfo(__tgt_device_info *DeviceInfo); - virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + /// Enqueue a host call to AsyncInfo + Error enqueueHostCall(void (*Callback)(void *), void *UserData, + __tgt_async_info *AsyncInfo); + virtual Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) = 0; /// Create an event. Error createEvent(void **EventPtrStorage); @@ -984,6 +980,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error waitEventImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Check if the event enqueued to AsyncInfo is complete + Expected<bool> isEventComplete(void *Event, __tgt_async_info *AsyncInfo); + virtual Expected<bool> + isEventCompleteImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Synchronize the current thread with the event. Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; @@ -1010,6 +1011,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { uint32_t getDefaultNumBlocks() const { return GridValues.GV_Default_Num_Teams; } + uint32_t getDebugKind() const { return OMPX_DebugKind; } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } @@ -1150,11 +1152,6 @@ private: virtual Error getDeviceHeapSize(uint64_t &V) = 0; virtual Error setDeviceHeapSize(uint64_t V) = 0; - /// Indicate whether the device should setup the device environment. Notice - /// that returning false in this function will change the behavior of the - /// setupDeviceEnvironment() function. - virtual bool shouldSetupDeviceEnvironment() const { return true; } - /// Indicate whether the device should setup the global device memory pool. If /// false is return the value on the device will be uninitialized. 
virtual bool shouldSetupDeviceMemoryPool() const { return true; } @@ -1210,7 +1207,7 @@ protected: enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; /// Array of peer access states with the rest of devices. This means that if - /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I has a matrix PeerAccesses with PeerAccesses == AVAILABLE, /// the device I can access device J's memory directly. However, notice this /// does not mean that device J can access device I's memory directly. llvm::SmallVector<PeerAccessState> PeerAccesses; @@ -1378,10 +1375,10 @@ public: /// Returns non-zero if the \p Image is compatible with the plugin. This /// function does not require the plugin to be initialized before use. - int32_t is_plugin_compatible(__tgt_device_image *Image); + int32_t isPluginCompatible(StringRef Image); /// Returns non-zero if the \p Image is compatible with the device. - int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image); + int32_t isDeviceCompatible(int32_t DeviceId, StringRef Image); /// Returns non-zero if the plugin device has been initialized. int32_t is_device_initialized(int32_t DeviceId) const; @@ -1448,6 +1445,10 @@ public: int DstDeviceId, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo); + /// Places a fence between previous data movements and following data + /// movements if necessary on the device + int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo); + /// Begin executing a kernel on the given device. int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, @@ -1485,10 +1486,6 @@ public: /// Creates an asynchronous queue for the given plugin. int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr); - /// Creates device information to be used for diagnostics. 
- int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo, - const char **ErrStr); - /// Sets the offset into the devices for use by OMPT. int32_t set_device_identifier(int32_t UserId, int32_t DeviceId); diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 00720fa2d810..881e27dad384 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -49,13 +49,6 @@ using namespace omp::target; namespace { -bool isImageBitcode(const __tgt_device_image &Image) { - StringRef Binary(reinterpret_cast<const char *>(Image.ImageStart), - utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); - - return identify_magic(Binary) == file_magic::bitcode; -} - Expected<std::unique_ptr<Module>> createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB, LLVMContext &Context) { @@ -66,12 +59,10 @@ createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB, "failed to create module"); return std::move(Mod); } -Expected<std::unique_ptr<Module>> -createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) { - StringRef Data((const char *)Image.ImageStart, - utils::getPtrDiff(Image.ImageEnd, Image.ImageStart)); +Expected<std::unique_ptr<Module>> createModuleFromImage(StringRef Image, + LLVMContext &Context) { std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer( - Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false); + Image, /*BufferName=*/"", /*RequiresNullTerminator=*/false); return createModuleFromMemoryBuffer(MB, Context); } @@ -189,9 +180,10 @@ Expected<std::unique_ptr<MemoryBuffer>> JITEngine::backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel) { - auto RemarksFileOrErr = setupLLVMOptimizationRemarks( - M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"", - /*RemarksFormat=*/"", /*RemarksWithHotness=*/false); + Expected<LLVMRemarkFileHandle> RemarksFileOrErr = + setupLLVMOptimizationRemarks( + 
M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"", + /*RemarksFormat=*/"", /*RemarksWithHotness=*/false); if (Error E = RemarksFileOrErr.takeError()) return std::move(E); if (*RemarksFileOrErr) @@ -238,7 +230,7 @@ JITEngine::backend(Module &M, const std::string &ComputeUnitKind, } Expected<std::unique_ptr<MemoryBuffer>> -JITEngine::getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx, +JITEngine::getOrCreateObjFile(StringRef Image, LLVMContext &Ctx, const std::string &ComputeUnitKind) { // Check if the user replaces the module at runtime with a finished object. @@ -277,58 +269,28 @@ JITEngine::getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx, return backend(*Mod, ComputeUnitKind, JITOptLevel); } -Expected<const __tgt_device_image *> -JITEngine::compile(const __tgt_device_image &Image, - const std::string &ComputeUnitKind, +Expected<std::unique_ptr<MemoryBuffer>> +JITEngine::compile(StringRef Image, const std::string &ComputeUnitKind, PostProcessingFn PostProcessing) { std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex); - // Check if we JITed this image for the given compute unit kind before. 
- ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind]; - if (CUI.TgtImageMap.contains(&Image)) - return CUI.TgtImageMap[&Image].get(); - - auto ObjMBOrErr = getOrCreateObjFile(Image, CUI.Context, ComputeUnitKind); + LLVMContext Ctz; + auto ObjMBOrErr = getOrCreateObjFile(Image, Ctz, ComputeUnitKind); if (!ObjMBOrErr) return ObjMBOrErr.takeError(); - auto ImageMBOrErr = PostProcessing(std::move(*ObjMBOrErr)); - if (!ImageMBOrErr) - return ImageMBOrErr.takeError(); - - CUI.JITImages.insert({&Image, std::move(*ImageMBOrErr)}); - auto &ImageMB = CUI.JITImages[&Image]; - CUI.TgtImageMap.insert({&Image, std::make_unique<__tgt_device_image>()}); - auto &JITedImage = CUI.TgtImageMap[&Image]; - *JITedImage = Image; - JITedImage->ImageStart = const_cast<char *>(ImageMB->getBufferStart()); - JITedImage->ImageEnd = const_cast<char *>(ImageMB->getBufferEnd()); - - return JITedImage.get(); + return PostProcessing(std::move(*ObjMBOrErr)); } -Expected<const __tgt_device_image *> -JITEngine::process(const __tgt_device_image &Image, - target::plugin::GenericDeviceTy &Device) { - const std::string &ComputeUnitKind = Device.getComputeUnitKind(); +Expected<std::unique_ptr<MemoryBuffer>> +JITEngine::process(StringRef Image, target::plugin::GenericDeviceTy &Device) { + assert(identify_magic(Image) == file_magic::bitcode && "Image not LLVM-IR"); + const std::string &ComputeUnitKind = Device.getComputeUnitKind(); PostProcessingFn PostProcessing = [&Device](std::unique_ptr<MemoryBuffer> MB) -> Expected<std::unique_ptr<MemoryBuffer>> { return Device.doJITPostProcessing(std::move(MB)); }; - if (isImageBitcode(Image)) - return compile(Image, ComputeUnitKind, PostProcessing); - - return &Image; -} - -void JITEngine::erase(const __tgt_device_image &Image, - target::plugin::GenericDeviceTy &Device) { - std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex); - const std::string &ComputeUnitKind = Device.getComputeUnitKind(); - ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind]; - - 
CUI.TgtImageMap.erase(&Image); - CUI.JITImages.erase(&Image); + return compile(Image, ComputeUnitKind, PostProcessing); } diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 083d41659a46..db43cbe49cc2 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -73,11 +73,17 @@ private: }; llvm::SmallVector<GlobalEntry> GlobalEntries{}; - void *suggestAddress(uint64_t MaxMemoryAllocation) { + Expected<void *> suggestAddress(uint64_t MaxMemoryAllocation) { // Get a valid pointer address for this system - void *Addr = + auto AddrOrErr = Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); - Device->free(Addr); + if (!AddrOrErr) + return AddrOrErr.takeError(); + + void *Addr = *AddrOrErr; + if (auto Err = Device->free(Addr)) + return std::move(Err); + // Align Address to MaxMemoryAllocation Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; @@ -86,8 +92,12 @@ private: Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) { size_t ASize = MaxMemoryAllocation; - if (!VAddr && isRecording()) - VAddr = suggestAddress(MaxMemoryAllocation); + if (!VAddr && isRecording()) { + auto VAddrOrErr = suggestAddress(MaxMemoryAllocation); + if (!VAddrOrErr) + return VAddrOrErr.takeError(); + VAddr = *VAddrOrErr; + } DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr); @@ -117,8 +127,11 @@ private: constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) { - MemoryStart = + auto MemoryStartOrErr = Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); + if (!MemoryStartOrErr) + return MemoryStartOrErr.takeError(); + MemoryStart = *MemoryStartOrErr; if (MemoryStart) break; } @@ -214,15 +227,7 @@ public: raw_fd_ostream OS(ImageName, EC); if (EC) report_fatal_error("Error 
saving image : " + StringRef(EC.message())); - if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { - size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd, - TgtImageBitcode->ImageStart); - MemoryBufferRef MBR = MemoryBufferRef( - StringRef((const char *)TgtImageBitcode->ImageStart, Size), ""); - OS << MBR.getBuffer(); - } else { - OS << Image.getMemoryBuffer().getBuffer(); - } + OS << Image.getMemoryBuffer().getBuffer(); OS.close(); } @@ -360,65 +365,19 @@ public: return Plugin::success(); } - void deinit() { + Error deinit() { if (UsedVAMap) { if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize)) - report_fatal_error("Error on releasing virtual memory space"); + return Err; } else { - Device->free(MemoryStart); + if (auto Err = Device->free(MemoryStart)) + return Err; } + return Plugin::success(); } }; } // namespace llvm::omp::target::plugin -// Extract the mapping of host function pointers to device function pointers -// from the entry table. Functions marked as 'indirect' in OpenMP will have -// offloading entries generated for them which map the host's function pointer -// to a global containing the corresponding function pointer on the device. 
-static Expected<std::pair<void *, uint64_t>> -setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device, - DeviceImageTy &Image) { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - - llvm::ArrayRef<llvm::offloading::EntryTy> Entries( - Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd); - llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; - for (const auto &Entry : Entries) { - if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 || - !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) - continue; - - assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); - auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); - - GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size); - if (auto Err = - Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) - return std::move(Err); - - HstPtr = Entry.Address; - if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(), - Entry.Size, nullptr)) - return std::move(Err); - } - - // If we do not have any indirect globals we exit early. - if (IndirectCallTable.empty()) - return std::pair{nullptr, 0}; - - // Sort the array to allow for more efficient lookup of device pointers. 
- llvm::sort(IndirectCallTable, - [](const auto &x, const auto &y) { return x.first < y.first; }); - - uint64_t TableSize = - IndirectCallTable.size() * sizeof(std::pair<void *, void *>); - void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE); - if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(), - TableSize, nullptr)) - return std::move(Err); - return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); -} - AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr) : Device(Device), @@ -662,6 +621,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); } + // Return the number of teams required to cover the loop iterations. + if (isNoLoopMode()) + return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; + uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks(); uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max(); if (LoopTripCount > 0) { @@ -857,9 +820,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) { return Err; } - if (Image->getTgtImageBitcode()) - Plugin.getJIT().erase(*Image->getTgtImageBitcode(), Image->getDevice()); - return unloadBinaryImpl(Image); } @@ -893,7 +853,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { RecordReplayTy &RecordReplay = Plugin.getRecordReplay(); if (RecordReplay.isRecordingOrReplaying()) - RecordReplay.deinit(); + if (auto Err = RecordReplay.deinit()) + return Err; if (RPCServer) if (auto Err = RPCServer->deinitDevice(*this)) @@ -909,40 +870,33 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return deinitImpl(); } -Expected<DeviceImageTy *> -GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, - const __tgt_device_image *InputTgtImage) { - assert(InputTgtImage && "Expected non-null target image"); - DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart)); - - 
auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this); - if (!PostJITImageOrErr) { - auto Err = PostJITImageOrErr.takeError(); - REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage, - DeviceId, toStringWithoutConsuming(Err).data()); - return Plugin::error(ErrorCode::COMPILE_FAILURE, std::move(Err), - "failure to jit IR image"); +Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, + StringRef InputTgtImage) { + DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage.bytes_begin())); + + std::unique_ptr<MemoryBuffer> Buffer; + if (identify_magic(InputTgtImage) == file_magic::bitcode) { + auto CompiledImageOrErr = Plugin.getJIT().process(InputTgtImage, *this); + if (!CompiledImageOrErr) { + return Plugin::error(ErrorCode::COMPILE_FAILURE, + CompiledImageOrErr.takeError(), + "failure to jit IR image"); + } + Buffer = std::move(*CompiledImageOrErr); + } else { + Buffer = MemoryBuffer::getMemBufferCopy(InputTgtImage); } // Load the binary and allocate the image object. Use the next available id // for the image id, which is the number of previously loaded images. - auto ImageOrErr = - loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size()); + auto ImageOrErr = loadBinaryImpl(std::move(Buffer), LoadedImages.size()); if (!ImageOrErr) return ImageOrErr.takeError(); - DeviceImageTy *Image = *ImageOrErr; - assert(Image != nullptr && "Invalid image"); - if (InputTgtImage != PostJITImageOrErr.get()) - Image->setTgtImageBitcode(InputTgtImage); // Add the image to list. LoadedImages.push_back(Image); - // Setup the device environment if needed. - if (auto Err = setupDeviceEnvironment(Plugin, *Image)) - return std::move(Err); - // Setup the global device memory pool if needed. 
if (!Plugin.getRecordReplay().isReplaying() && shouldSetupDeviceMemoryPool()) { @@ -960,12 +914,12 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #ifdef OMPT_SUPPORT if (ompt::Initialized) { - size_t Bytes = - utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); + size_t Bytes = InputTgtImage.size(); performOmptCallback( device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, - /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart, + /*ImgSize=*/Bytes, + /*HostAddr=*/const_cast<unsigned char *>(InputTgtImage.bytes_begin()), /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0); } #endif @@ -978,43 +932,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return Image; } -Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, - DeviceImageTy &Image) { - // There are some plugins that do not need this step. - if (!shouldSetupDeviceEnvironment()) - return Plugin::success(); - - // Obtain a table mapping host function pointers to device function pointers. - auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image); - if (!CallTablePairOrErr) - return CallTablePairOrErr.takeError(); - - DeviceEnvironmentTy DeviceEnvironment; - DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind; - DeviceEnvironment.NumDevices = Plugin.getNumDevices(); - // TODO: The device ID used here is not the real device ID used by OpenMP. - DeviceEnvironment.DeviceNum = DeviceId; - DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; - DeviceEnvironment.ClockFrequency = getClockFrequency(); - DeviceEnvironment.IndirectCallTable = - reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); - DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; - DeviceEnvironment.HardwareParallelism = getHardwareParallelism(); - - // Create the metainfo of the device environment global. 
- GlobalTy DevEnvGlobal("__omp_rtl_device_environment", - sizeof(DeviceEnvironmentTy), &DeviceEnvironment); - - // Write device environment values to the device. - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) { - DP("Missing symbol %s, continue execution anyway.\n", - DevEnvGlobal.getName().data()); - consumeError(std::move(Err)); - } - return Plugin::success(); -} - Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize) { @@ -1337,16 +1254,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) { Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue) { + if (!AsyncInfo) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "invalid async info queue"); + SmallVector<void *> AllocsToDelete{}; { std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex}; - if (!AsyncInfo || !AsyncInfo->Queue) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid async info queue"); - - if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) - return Err; + // This can be false when no work has been added to the AsyncInfo. In which + // case, the device has nothing to synchronize. 
+ if (AsyncInfo->Queue) + if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) + return Err; std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations); } @@ -1391,10 +1311,12 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, switch (Kind) { case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE_NON_BLOCKING: case TARGET_ALLOC_DEVICE: if (MemoryManager) { - Alloc = MemoryManager->allocate(Size, HostPtr); + auto AllocOrErr = MemoryManager->allocate(Size, HostPtr); + if (!AllocOrErr) + return AllocOrErr.takeError(); + Alloc = *AllocOrErr; if (!Alloc) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate from memory manager"); @@ -1402,12 +1324,16 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, } [[fallthrough]]; case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - Alloc = allocate(Size, HostPtr, Kind); + case TARGET_ALLOC_SHARED: { + auto AllocOrErr = allocate(Size, HostPtr, Kind); + if (!AllocOrErr) + return AllocOrErr.takeError(); + Alloc = *AllocOrErr; if (!Alloc) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate from device allocator"); } + } // Report error if the memory manager or the device allocator did not return // any memory buffer. 
@@ -1479,29 +1405,19 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { #undef DEALLOCATION_ERROR } - int Res; switch (Kind) { case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE_NON_BLOCKING: case TARGET_ALLOC_DEVICE: if (MemoryManager) { - Res = MemoryManager->free(TgtPtr); - if (Res) - return Plugin::error( - ErrorCode::OUT_OF_RESOURCES, - "failure to deallocate device pointer %p via memory manager", - TgtPtr); + if (auto Err = MemoryManager->free(TgtPtr)) + return Err; break; } [[fallthrough]]; case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: - Res = free(TgtPtr, Kind); - if (Res) - return Plugin::error( - ErrorCode::UNKNOWN, - "failure to deallocate device pointer %p via device deallocator", - TgtPtr); + if (auto Err = free(TgtPtr, Kind)) + return Err; } // Unregister deallocated pinned memory buffer if the type is host memory. @@ -1540,6 +1456,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, return Err; } +Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + auto Err = + dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, @@ -1589,10 +1515,13 @@ Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { return Err; } -Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) { - assert(DeviceInfo && "Invalid device info"); +Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); - return initDeviceInfoImpl(DeviceInfo); + auto Err = enqueueHostCallImpl(Callback, UserData, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return 
Err; } Error GenericDeviceTy::printInfo() { @@ -1648,6 +1577,22 @@ Expected<bool> GenericDeviceTy::hasPendingWork(__tgt_async_info *AsyncInfo) { return Res; } +Expected<bool> GenericDeviceTy::isEventComplete(void *Event, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + auto Res = isEventCompleteImpl(Event, AsyncInfoWrapper); + if (auto Err = Res.takeError()) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + + auto Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + if (Err) + return Err; + return Res; +} + Error GenericDeviceTy::syncEvent(void *EventPtr) { return syncEventImpl(EventPtr); } @@ -1774,28 +1719,26 @@ Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const { int32_t GenericPluginTy::is_initialized() const { return Initialized; } -int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { - StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart), - utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); - +int32_t GenericPluginTy::isPluginCompatible(StringRef Image) { auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); - DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str()); + DP("Failure to check validity of image %p: %s", Image.data(), + ErrStr.c_str()); return false; }; - switch (identify_magic(Buffer)) { + switch (identify_magic(Image)) { case file_magic::elf: case file_magic::elf_relocatable: case file_magic::elf_executable: case file_magic::elf_shared_object: case file_magic::elf_core: { - auto MatchOrErr = checkELFImage(Buffer); + auto MatchOrErr = checkELFImage(Image); if (Error Err = MatchOrErr.takeError()) return HandleError(std::move(Err)); return *MatchOrErr; } case file_magic::bitcode: { - auto MatchOrErr = checkBitcodeImage(Buffer); + auto MatchOrErr = checkBitcodeImage(Image); if (Error Err = MatchOrErr.takeError()) return HandleError(std::move(Err)); return 
*MatchOrErr; @@ -1805,36 +1748,33 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { } } -int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, - __tgt_device_image *Image) { - StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart), - utils::getPtrDiff(Image->ImageEnd, Image->ImageStart)); - +int32_t GenericPluginTy::isDeviceCompatible(int32_t DeviceId, StringRef Image) { auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); - DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str()); + DP("Failure to check validity of image %p: %s", Image.data(), + ErrStr.c_str()); return false; }; - switch (identify_magic(Buffer)) { + switch (identify_magic(Image)) { case file_magic::elf: case file_magic::elf_relocatable: case file_magic::elf_executable: case file_magic::elf_shared_object: case file_magic::elf_core: { - auto MatchOrErr = checkELFImage(Buffer); + auto MatchOrErr = checkELFImage(Image); if (Error Err = MatchOrErr.takeError()) return HandleError(std::move(Err)); if (!*MatchOrErr) return false; // Perform plugin-dependent checks for the specific architecture if needed. 
- auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer); + auto CompatibleOrErr = isELFCompatible(DeviceId, Image); if (Error Err = CompatibleOrErr.takeError()) return HandleError(std::move(Err)); return *CompatibleOrErr; } case file_magic::bitcode: { - auto MatchOrErr = checkBitcodeImage(Buffer); + auto MatchOrErr = checkBitcodeImage(Image); if (Error Err = MatchOrErr.takeError()) return HandleError(std::move(Err)); return *MatchOrErr; @@ -1895,7 +1835,9 @@ int32_t GenericPluginTy::load_binary(int32_t DeviceId, __tgt_device_binary *Binary) { GenericDeviceTy &Device = getDevice(DeviceId); - auto ImageOrErr = Device.loadBinary(*this, TgtImage); + StringRef Buffer(reinterpret_cast<const char *>(TgtImage->ImageStart), + utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart)); + auto ImageOrErr = Device.loadBinary(*this, Buffer); if (!ImageOrErr) { auto Err = ImageOrErr.takeError(); REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage, @@ -2180,21 +2122,6 @@ int32_t GenericPluginTy::init_async_info(int32_t DeviceId, return OFFLOAD_SUCCESS; } -int32_t GenericPluginTy::init_device_info(int32_t DeviceId, - __tgt_device_info *DeviceInfo, - const char **ErrStr) { - *ErrStr = ""; - - auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo); - if (Err) { - REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n", - DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - int32_t GenericPluginTy::set_device_identifier(int32_t UserId, int32_t DeviceId) { UserDeviceIds[DeviceId] = UserId; @@ -2217,8 +2144,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, GenericGlobalHandlerTy &GHandler = getGlobalHandler(); if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) { - REPORT("Failure to look up global address: %s\n", - toString(std::move(Err)).data()); + consumeError(std::move(Err)); return OFFLOAD_FAIL; } @@ 
-2324,3 +2250,15 @@ int32_t GenericPluginTy::async_barrier(omp_interop_val_t *Interop) { } return OFFLOAD_SUCCESS; } + +int32_t GenericPluginTy::data_fence(int32_t DeviceId, + __tgt_async_info *AsyncInfo) { + auto Err = getDevice(DeviceId).dataFence(AsyncInfo); + if (Err) { + REPORT("failure to place data fence on device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp index 678be78b56af..e19f2ef94de6 100644 --- a/offload/plugins-nextgen/common/src/RPC.cpp +++ b/offload/plugins-nextgen/common/src/RPC.cpp @@ -28,15 +28,22 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device, switch (Port.get_opcode()) { case LIBC_MALLOC: { Port.recv_and_send([&](rpc::Buffer *Buffer, uint32_t) { - Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate( - Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING)); + auto PtrOrErr = + Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE); + void *Ptr = nullptr; + if (!PtrOrErr) + llvm::consumeError(PtrOrErr.takeError()); + else + Ptr = *PtrOrErr; + Buffer->data[0] = reinterpret_cast<uintptr_t>(Ptr); }); break; } case LIBC_FREE: { Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Device.free(reinterpret_cast<void *>(Buffer->data[0]), - TARGET_ALLOC_DEVICE_NON_BLOCKING); + if (auto Err = Device.free(reinterpret_cast<void *>(Buffer->data[0]), + TARGET_ALLOC_DEVICE)) + llvm::consumeError(std::move(Err)); }); break; } @@ -171,9 +178,13 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, plugin::DeviceImageTy &Image) { uint64_t NumPorts = std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT); - void *RPCBuffer = Device.allocate( + auto RPCBufferOrErr = Device.allocate( rpc::Server::allocation_size(Device.getWarpSize(), NumPorts), nullptr, TARGET_ALLOC_HOST); + if (!RPCBufferOrErr) + return RPCBufferOrErr.takeError(); + + 
void *RPCBuffer = *RPCBufferOrErr; if (!RPCBuffer) return plugin::Plugin::error( error::ErrorCode::UNKNOWN, @@ -198,7 +209,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex); - Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST); + if (auto Err = Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST)) + return Err; Buffers[Device.getDeviceId()] = nullptr; Devices[Device.getDeviceId()] = nullptr; return Error::success(); diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 361a781e8f9b..f5b2d074a47e 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4) DLWRAP(cuMemcpyHtoD, 3) DLWRAP(cuMemcpyHtoDAsync, 4) +DLWRAP(cuMemsetD8Async, 4) +DLWRAP(cuMemsetD16Async, 4) +DLWRAP(cuMemsetD32Async, 4) +DLWRAP(cuMemsetD2D8Async, 6) +DLWRAP(cuMemsetD2D16Async, 6) +DLWRAP(cuMemsetD2D32Async, 6) + DLWRAP(cuMemFree, 1) DLWRAP(cuMemFreeHost, 1) DLWRAP(cuMemFreeAsync, 2) @@ -72,6 +79,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3) DLWRAP(cuDevicePrimaryCtxSetFlags, 2) DLWRAP(cuDevicePrimaryCtxRetain, 2) DLWRAP(cuModuleLoadDataEx, 5) +DLWRAP(cuOccupancyMaxPotentialBlockSize, 6) DLWRAP(cuDeviceCanAccessPeer, 3) DLWRAP(cuCtxEnablePeerAccess, 2) @@ -82,6 +90,7 @@ DLWRAP(cuCtxSetLimit, 2) DLWRAP(cuEventCreate, 2) DLWRAP(cuEventRecord, 2) +DLWRAP(cuEventQuery, 1) DLWRAP(cuStreamWaitEvent, 3) DLWRAP(cuEventSynchronize, 1) DLWRAP(cuEventDestroy, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index b6c022c8e7e8..dec4e33508c6 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -290,6 +290,7 @@ static inline void 
*CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01; static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02; typedef void (*CUstreamCallback)(CUstream, CUresult, void *); +typedef size_t (*CUoccupancyB2DSize)(int); CUresult cuCtxGetDevice(CUdevice *); CUresult cuDeviceGet(CUdevice *, int); @@ -321,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); +CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); + CUresult cuMemFree(CUdeviceptr); CUresult cuMemFreeHost(void *); CUresult cuMemFreeAsync(CUdeviceptr, CUstream); @@ -352,6 +363,7 @@ CUresult cuCtxSetLimit(CUlimit, size_t); CUresult cuEventCreate(CUevent *, unsigned int); CUresult cuEventRecord(CUevent, CUstream); +CUresult cuEventQuery(CUevent); CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int); CUresult cuEventSynchronize(CUevent); CUresult cuEventDestroy(CUevent); @@ -372,5 +384,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); #endif diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index e94f3f6af7dd..db94f7f2dd99 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ 
-81,8 +81,8 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {} struct CUDADeviceImageTy : public DeviceImageTy { /// Create the CUDA image with the id and the target image pointer. CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, - const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {} + std::unique_ptr<MemoryBuffer> &&TgtImage) + : DeviceImageTy(ImageId, Device, std::move(TgtImage)), Module(nullptr) {} /// Load the image as a CUDA module. Error loadModule() { @@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + Expected<uint64_t> maxGroupSize(GenericDeviceTy &, + uint64_t DynamicMemSize) const override { + int MinGridSize; + int MaxBlockSize; + auto Res = cuOccupancyMaxPotentialBlockSize( + &MinGridSize, &MaxBlockSize, Func, NULL, DynamicMemSize, INT_MAX); + if (auto Err = Plugin::check( + Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) { + return Err; + } + return MaxBlockSize; + } + private: /// The CUDA kernel function to execute. CUfunction Func; @@ -371,6 +385,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = CUDAImage.unloadModule()) return Err; + // Destroy the associated memory and invalidate the object. + Plugin.free(Image); return Plugin::success(); } @@ -404,20 +420,12 @@ struct CUDADeviceTy : public GenericDeviceTy { virtual Error callGlobalConstructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - // Check for the presence of global destructors at initialization time. This - // is required when the image may be deallocated before destructors are run. 
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini")) - Image.setPendingGlobalDtors(); - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); } virtual Error callGlobalDestructors(GenericPluginTy &Plugin, DeviceImageTy &Image) override { - if (Image.hasPendingGlobalDtors()) - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); - return Plugin::success(); + return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); } Expected<std::unique_ptr<MemoryBuffer>> @@ -535,14 +543,15 @@ struct CUDADeviceTy : public GenericDeviceTy { CUdevice getCUDADevice() const { return Device; } /// Load the binary image into the device and allocate an image object. - Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, - int32_t ImageId) override { + Expected<DeviceImageTy *> + loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage, + int32_t ImageId) override { if (auto Err = setContext()) return std::move(Err); // Allocate and initialize the image object. CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>(); - new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage); + new (CUDAImage) CUDADeviceImageTy(ImageId, *this, std::move(TgtImage)); // Load the CUDA module. if (auto Err = CUDAImage->loadModule()) @@ -552,14 +561,12 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate memory on the device or related to the device. 
- void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override { if (Size == 0) return nullptr; - if (auto Err = setContext()) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = setContext()) + return std::move(Err); void *MemAlloc = nullptr; CUdeviceptr DevicePtr; @@ -578,35 +585,20 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); MemAlloc = (void *)DevicePtr; break; - case TARGET_ALLOC_DEVICE_NON_BLOCKING: { - CUstream Stream; - if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) - break; - if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream))) - break; - cuStreamSynchronize(Stream); - Res = cuStreamDestroy(Stream); - MemAlloc = (void *)DevicePtr; - } } - if (auto Err = - Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) + return std::move(Err); return MemAlloc; } /// Deallocate memory on the device or related to the device. 
- int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { if (TgtPtr == nullptr) - return OFFLOAD_SUCCESS; + return Plugin::success(); - if (auto Err = setContext()) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } + if (auto Err = setContext()) + return Err; CUresult Res; switch (Kind) { @@ -618,22 +610,9 @@ struct CUDADeviceTy : public GenericDeviceTy { case TARGET_ALLOC_HOST: Res = cuMemFreeHost(TgtPtr); break; - case TARGET_ALLOC_DEVICE_NON_BLOCKING: { - CUstream Stream; - if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) - break; - cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream); - cuStreamSynchronize(Stream); - if ((Res = cuStreamDestroy(Stream))) - break; - } } - if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + return Plugin::check(Res, "error in cuMemFree[Host]: %s"); } /// Synchronize current thread with the pending operations on the async info. 
@@ -844,6 +823,64 @@ struct CUDADeviceTy : public GenericDeviceTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + CUresult Res; + size_t N = Size / PatternSize; + if (PatternSize == 1) { + Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint8_t *>(PatternPtr)), N, + Stream); + } else if (PatternSize == 2) { + Res = cuMemsetD16Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint16_t *>(PatternPtr)), N, + Stream); + } else if (PatternSize == 4) { + Res = cuMemsetD32Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint32_t *>(PatternPtr)), N, + Stream); + } else { + // For larger patterns we can do a series of strided fills to copy the + // pattern efficiently + int64_t MemsetSize = PatternSize % 4u == 0u ? 4u + : PatternSize % 2u == 0u ? 2u + : 1u; + + int64_t NumberOfSteps = PatternSize / MemsetSize; + int64_t Pitch = NumberOfSteps * MemsetSize; + int64_t Height = Size / PatternSize; + + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + if (MemsetSize == 4) { + Res = cuMemsetD2D32Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } else if (MemsetSize == 2) { + Res = cuMemsetD2D16Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } else { + Res = cuMemsetD2D8Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } + } + } + + return Plugin::check(Res, "error in cuMemset: %s"); + } + /// Initialize the async info for interoperability purposes. 
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { if (auto Err = setContext()) @@ -856,23 +893,70 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } - /// Initialize the device info for interoperability purposes. - Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { - assert(Context && "Context is null"); - assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device"); + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for CUDA devices as operations inserted into + /// a queue are in-order. + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } - if (auto Err = setContext()) - return Err; + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override { + return interop_spec_t{tgt_fr_cuda, {true, 0}, 0}; + } + + Expected<omp_interop_val_t *> + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override { + auto *Ret = new omp_interop_val_t( + DeviceId, static_cast<kmp_interop_type_t>(InteropType)); + Ret->fr_id = tgt_fr_cuda; + Ret->vendor_id = omp_vendor_nvidia; + + if (InteropType == kmp_interop_type_target || + InteropType == kmp_interop_type_targetsync) { + Ret->device_info.Platform = nullptr; + Ret->device_info.Device = reinterpret_cast<void *>(Device); + Ret->device_info.Context = Context; + } + + if (InteropType == kmp_interop_type_targetsync) { + Ret->async_info = new __tgt_async_info(); + if (auto Err = setContext()) + return Err; + CUstream Stream; + if (auto Err = CUDAStreamManager.getResource(Stream)) + return Err; + + Ret->async_info->Queue = Stream; + } + return Ret; + } - if (!DeviceInfo->Context) - DeviceInfo->Context = Context; + Error releaseInterop(omp_interop_val_t *Interop) override { + if (!Interop) + return Plugin::success(); - if (!DeviceInfo->Device) - DeviceInfo->Device = reinterpret_cast<void *>(Device); + if 
(Interop->async_info) + delete Interop->async_info; + delete Interop; return Plugin::success(); } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + CUresult Res = cuLaunchHostFunc(Stream, Callback, UserData); + return Plugin::check(Res, "error in cuStreamLaunchHostFunc: %s"); + }; + /// Create an event. Error createEventImpl(void **EventPtrStorage) override { CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage); @@ -914,9 +998,33 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::check(Res, "error in cuStreamWaitEvent: %s"); } - // TODO: This should be implementable on CUDA Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { - return true; + CUstream Stream; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + CUresult Ret = cuStreamQuery(Stream); + if (Ret == CUDA_SUCCESS) + return false; + + if (Ret == CUDA_ERROR_NOT_READY) + return true; + + return Plugin::check(Ret, "error in cuStreamQuery: %s"); + } + + Expected<bool> isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &) override { + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + CUresult Ret = cuEventQuery(Event); + if (Ret == CUDA_SUCCESS) + return true; + + if (Ret == CUDA_ERROR_NOT_READY) + return false; + + return Plugin::check(Ret, "error in cuEventQuery: %s"); } /// Synchronize the current thread with the event. 
@@ -944,18 +1052,27 @@ struct CUDADeviceTy : public GenericDeviceTy { Info.add("CUDA OpenMP Device Number", DeviceId); Res = cuDeviceGetName(TmpChar, 1000, Device); - if (Res == CUDA_SUCCESS) + if (Res == CUDA_SUCCESS) { Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); + } Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR); + Info.add("Vendor ID", uint64_t{4318}, "", DeviceInfo::VENDOR_ID); + + Info.add("Memory Address Size", std::numeric_limits<CUdeviceptr>::digits, + "bits", DeviceInfo::ADDRESS_BITS); + Res = cuDeviceTotalMem(&TmpSt, Device); if (Res == CUDA_SUCCESS) - Info.add("Global Memory Size", TmpSt, "bytes"); + Info.add("Global Memory Size", TmpSt, "bytes", + DeviceInfo::GLOBAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Number of Multiprocessors", TmpInt); + Info.add("Number of Multiprocessors", TmpInt, "", + DeviceInfo::NUM_COMPUTE_UNITS); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt); if (Res == CUDA_SUCCESS) @@ -995,7 +1112,13 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) MaxBlock.add("z", TmpInt); - auto &MaxGrid = *Info.add("Maximum Grid Dimensions", ""); + // TODO: I assume CUDA devices have no limit on the amount of threads, + // verify this + Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "", + DeviceInfo::MAX_WORK_SIZE); + + auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "", + DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); if (Res == CUDA_SUCCESS) MaxGrid.add("x", TmpInt); @@ -1016,7 +1139,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Clock Rate", TmpInt, "kHz"); + Info.add("Clock Rate", TmpInt / 1000, "MHz", + 
DeviceInfo::MAX_CLOCK_FREQUENCY); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt); if (Res == CUDA_SUCCESS) @@ -1053,7 +1177,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Memory Clock Rate", TmpInt, "kHz"); + Info.add("Memory Clock Rate", TmpInt / 1000, "MHz", + DeviceInfo::MEMORY_CLOCK_RATE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt); if (Res == CUDA_SUCCESS) @@ -1166,7 +1291,7 @@ private: // Perform a quick check for the named kernel in the image. The kernel // should be created by the 'nvptx-lower-ctor-dtor' pass. GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) + if (!Handler.isSymbolInImage(*this, Image, KernelName)) return Plugin::success(); // The Nvidia backend cannot handle creating the ctor / dtor array @@ -1201,8 +1326,12 @@ private: // Allocate a buffer to store all of the known constructor / destructor // functions in so we can iterate them on the device. - void *Buffer = + auto BufferOrErr = allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + void *Buffer = *BufferOrErr; if (!Buffer) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate memory for global buffer"); @@ -1251,12 +1380,10 @@ private: Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); + if (Err) + return Err; - if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS) - return Plugin::error(ErrorCode::UNKNOWN, - "failed to free memory for global buffer"); - - return Err; + return free(Buffer, TARGET_ALLOC_DEVICE); } /// Stream manager for CUDA streams. 
@@ -1319,7 +1446,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); if (auto Err = Plugin::check( AttrResult, - "Error in cuLaunchKernel while setting the memory limits: %s")) + "error in cuFuncSetAttribute while setting the memory limits: %s")) return Err; MaxDynCGroupMemLimit = MaxDynCGroupMem; } @@ -1453,7 +1580,7 @@ struct CUDAPluginTy final : public GenericPluginTy { unsigned SM = Header.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V1 ? Header.e_flags & ELF::EF_CUDA_SM - : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> 8; + : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> ELF::EF_CUDA_SM_OFFSET; CUdevice Device; CUresult Res = cuDeviceGet(&Device, DeviceId); diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ed5213531999..eb4ecac9907a 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy { return Plugin::success(); } + /// Return maximum block size for maximum occupancy + Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device, + uint64_t DynamicMemSize) const override { + return Plugin::error( + ErrorCode::UNSUPPORTED, + "occupancy calculations are not implemented for the host device"); + } + private: /// The kernel function to execute. void (*Func)(void); @@ -123,8 +131,8 @@ private: struct GenELF64DeviceImageTy : public DeviceImageTy { /// Create the GenELF64 image with the id and the target image pointer. GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, - const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {} + std::unique_ptr<MemoryBuffer> &&TgtImage) + : DeviceImageTy(ImageId, Device, std::move(TgtImage)), DynLib() {} /// Getter and setter for the dynamic library. 
DynamicLibrary &getDynamicLibrary() { return DynLib; } @@ -181,11 +189,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy { Error setContext() override { return Plugin::success(); } /// Load the binary image into the device and allocate an image object. - Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, - int32_t ImageId) override { + Expected<DeviceImageTy *> + loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage, + int32_t ImageId) override { // Allocate and initialize the image object. GenELF64DeviceImageTy *Image = Plugin.allocate<GenELF64DeviceImageTy>(); - new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage); + new (Image) GenELF64DeviceImageTy(ImageId, *this, std::move(TgtImage)); // Create a temporary file. char TmpFileName[] = "/tmp/tmpfile_XXXXXX"; @@ -231,7 +240,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// Allocate memory. Use std::malloc in all cases. - void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + Expected<void *> allocate(size_t Size, void *, TargetAllocTy Kind) override { if (Size == 0) return nullptr; @@ -241,7 +250,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { case TARGET_ALLOC_DEVICE: case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: - case TARGET_ALLOC_DEVICE_NON_BLOCKING: MemAlloc = std::malloc(Size); break; } @@ -249,9 +257,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// Free the memory. Use std::free in all cases. - int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { std::free(TgtPtr); - return OFFLOAD_SUCCESS; + return Plugin::success(); } /// This plugin does nothing to lock buffers. Do not return an error, just @@ -295,6 +303,28 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + /// Insert a data fence between previous data operations and the following + /// operations. 
This is a no-op for Host devices as operations inserted into + /// a queue are in-order. + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (PatternSize == 1) { + std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size); + } else { + for (unsigned int Step = 0; Step < Size; Step += PatternSize) { + auto *Dst = static_cast<char *>(TgtPtr) + Step; + std::memcpy(Dst, PatternPtr, PatternSize); + } + } + + return Plugin::success(); + } + /// All functions are already synchronous. No need to do anything on this /// synchronization function. Error synchronizeImpl(__tgt_async_info &AsyncInfo, @@ -314,11 +344,11 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "initAsyncInfoImpl not supported"); } - /// This plugin does not support interoperability - Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { - return Plugin::error(ErrorCode::UNSUPPORTED, - "initDeviceInfoImpl not supported"); - } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + Callback(UserData); + return Plugin::success(); + }; /// This plugin does not support the event API. Do nothing without failing. Error createEventImpl(void **EventPtrStorage) override { @@ -337,6 +367,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy { Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { return true; } + Expected<bool> isEventCompleteImpl(void *Event, + AsyncInfoWrapperTy &AsyncInfo) override { + return true; + } Error syncEventImpl(void *EventPtr) override { return Plugin::success(); } /// Print information about the device. @@ -347,7 +381,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// This plugin should not setup the device environment or memory pool. 
- virtual bool shouldSetupDeviceEnvironment() const override { return false; }; virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; /// Getters and setters for stack size and heap size not relevant. diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index f3e8e9a66685..c0290bfdab3f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -83,6 +83,7 @@ config.test_format = lit.formats.ShTest() config.test_flags = " -I " + config.test_source_root + \ " -I " + config.omp_header_directory + \ " -L " + config.library_dir + \ + " -L " + config.llvm_library_intdir + \ " -L " + config.llvm_lib_directory # compiler specific flags @@ -165,11 +166,12 @@ else: # Unices config.test_flags += " -nogpulib" config.test_flags += " -Wl,-rpath," + config.library_dir config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory + config.test_flags += " -Wl,-rpath," + config.llvm_library_intdir config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory if config.cuda_libdir: config.test_flags += " -Wl,-rpath," + config.cuda_libdir if config.libomptarget_current_target.startswith('nvptx'): - config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir + config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.llvm_library_intdir + "/nvptx64-nvidia-cuda" if config.libomptarget_current_target.endswith('-LTO'): config.test_flags += " -foffload-lto" if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env( diff --git a/offload/test/mapping/chained_containing_structs_1.cc b/offload/test/mapping/chained_containing_structs_1.cc new file mode 100644 index 000000000000..4dbb17140de1 --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_1.cc @@ -0,0 +1,58 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// XFAIL: * + +#include <cstdlib> +#include <cstdio> +#include <cassert> + +struct S { + int a; + int b; + int c; +}; + +struct T { + S *s0; + S *s1; + S *s2; 
+}; + +int main() { + T *v = (T *) malloc (sizeof(T)); + v->s0 = (S *) malloc (sizeof(S)); + v->s1 = (S *) malloc (sizeof(S)); + v->s2 = (S *) malloc (sizeof(S)); + v->s0->a = 10; + v->s0->b = 10; + v->s0->c = 10; + v->s1->a = 20; + v->s1->b = 20; + v->s1->c = 20; + v->s2->a = 30; + v->s2->b = 30; + v->s2->c = 30; + +#pragma omp target map(to: v[:1]) map(tofrom: v->s1->b, v->s1->c, v->s2->b) + { + v->s1->b += 3; + v->s1->c += 5; + v->s2->b += 7; + } + + printf ("%d\n", v->s0->a); // CHECK: 10 + printf ("%d\n", v->s0->b); // CHECK: 10 + printf ("%d\n", v->s0->c); // CHECK: 10 + printf ("%d\n", v->s1->a); // CHECK: 20 + printf ("%d\n", v->s1->b); // CHECK: 23 + printf ("%d\n", v->s1->c); // CHECK: 25 + printf ("%d\n", v->s2->a); // CHECK: 30 + printf ("%d\n", v->s2->b); // CHECK: 37 + printf ("%d\n", v->s2->c); // CHECK: 30 + + free(v->s0); + free(v->s1); + free(v->s2); + free(v); + + return 0; +} diff --git a/offload/test/mapping/chained_containing_structs_2.cc b/offload/test/mapping/chained_containing_structs_2.cc new file mode 100644 index 000000000000..29c4c8b7fedf --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_2.cc @@ -0,0 +1,76 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// XFAIL: * + +#include <cstdlib> +#include <cstdio> +#include <cassert> + +struct R { + int d; + int e; + int f; +}; + +struct S { + R *r0; + R *r1; + R *r2; +}; + +struct T { + S *s0; + S *s1; + S *s2; +}; + +int main() { + T *v = (T *) malloc (sizeof(T)); + + v->s0 = (S *) malloc (sizeof(S)); + v->s1 = (S *) malloc (sizeof(S)); + v->s2 = (S *) malloc (sizeof(S)); + + v->s0->r0 = (R *) calloc (1, sizeof(R)); + v->s0->r1 = (R *) calloc (1, sizeof(R)); + v->s0->r2 = (R *) calloc (1, sizeof(R)); + + v->s1->r0 = (R *) calloc (1, sizeof(R)); + v->s1->r1 = (R *) calloc (1, sizeof(R)); + v->s1->r2 = (R *) calloc (1, sizeof(R)); + + v->s2->r0 = (R *) calloc (1, sizeof(R)); + v->s2->r1 = (R *) calloc (1, sizeof(R)); + v->s2->r2 = (R *) calloc (1, sizeof(R)); + + 
#pragma omp target map(to: v->s1, v->s2, *v->s1, v->s1->r1, *v->s2, v->s2->r0) \ + map(tofrom: v->s1->r1->d, v->s1->r1->e, v->s1->r2->d, v->s1->r2->f, v->s2->r0->e) + { + v->s1->r1->d += 3; + v->s1->r1->e += 5; + v->s1->r2->d += 7; + v->s1->r2->f += 9; + v->s2->r0->e += 11; + } + + printf ("%d\n", v->s1->r1->d); // CHECK: 3 + printf ("%d\n", v->s1->r1->e); // CHECK: 5 + printf ("%d\n", v->s1->r2->d); // CHECK: 7 + printf ("%d\n", v->s1->r2->f); // CHECK: 9 + printf ("%d\n", v->s2->r0->e); // CHECK: 11 + + free(v->s0->r0); + free(v->s0->r1); + free(v->s0->r2); + free(v->s1->r0); + free(v->s1->r1); + free(v->s1->r2); + free(v->s2->r0); + free(v->s2->r1); + free(v->s2->r2); + free(v->s0); + free(v->s1); + free(v->s2); + free(v); + + return 0; +} diff --git a/offload/test/mapping/chained_containing_structs_3.cc b/offload/test/mapping/chained_containing_structs_3.cc new file mode 100644 index 000000000000..23555bf69110 --- /dev/null +++ b/offload/test/mapping/chained_containing_structs_3.cc @@ -0,0 +1,217 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <cstdlib> +#include <cstdio> +#include <cassert> +#include <cstring> + +#include <omp.h> + +struct R { + int d; + int e; + int f; +}; + +struct S { + int a; + int b; + struct { + int c; + R r; + R *rp; + } sub; + int g; +}; + +struct T { + int a; + int *ptr; + int b; +}; + +int main() { + R r; + R *rp = new R; + S s; + S *sp = new S; + T t; + T *tp = new T; + + memset(&r, 0, sizeof(R)); + memset(rp, 0, sizeof(R)); + memset(&s, 0, sizeof(S)); + memset(sp, 0, sizeof(S)); + memset(&t, 0, sizeof(T)); + memset(tp, 0, sizeof(T)); + + s.sub.rp = new R; + sp->sub.rp = new R; + + memset(s.sub.rp, 0, sizeof(R)); + memset(sp->sub.rp, 0, sizeof(R)); + + t.ptr = new int[10]; + tp->ptr = new int[10]; + + memset(t.ptr, 0, sizeof(int)*10); + memset(tp->ptr, 0, sizeof(int)*10); + +#pragma omp target map(tofrom: r) map(tofrom: r.e) +{ + r.d++; + r.e += 2; + r.f += 3; +} + printf ("%d\n", r.d); // CHECK: 
1 + printf ("%d\n", r.e); // CHECK-NEXT: 2 + printf ("%d\n", r.f); // CHECK-NEXT: 3 + +#pragma omp target map(tofrom: rp[:1]) map(tofrom: rp->e) +{ + rp->d++; + rp->e += 2; + rp->f += 3; +} + + printf ("%d\n", rp->d); // CHECK-NEXT: 1 + printf ("%d\n", rp->e); // CHECK-NEXT: 2 + printf ("%d\n", rp->f); // CHECK-NEXT: 3 + + int v; + int *orig_addr_v = &v; + bool separate_memory_space; + +#pragma omp target data map(v) + { + void *mapped_ptr_v = + omp_get_mapped_ptr(orig_addr_v, omp_get_default_device()); + separate_memory_space = mapped_ptr_v != (void*) orig_addr_v; + } + + const char *mapping_flavour = separate_memory_space ? "separate" : "unified"; + +#pragma omp target map(to: s) map(tofrom: s.sub.r.e) +{ + s.b++; + s.sub.r.d+=2; + s.sub.r.e+=3; + s.sub.r.f+=4; +} + + printf ("%d/%s\n", s.b, mapping_flavour); + printf ("%d/%s\n", s.sub.r.d, mapping_flavour); + printf ("%d/%s\n", s.sub.r.e, mapping_flavour); + printf ("%d/%s\n", s.sub.r.f, mapping_flavour); + + // CHECK: {{0/separate|1/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: s, s.b) map(to: s.sub.rp[:1]) map(tofrom: s.sub.rp->e) +{ + s.b++; + s.sub.rp->d+=2; + s.sub.rp->e+=3; + s.sub.rp->f+=4; +} + + printf ("%d/%s\n", s.b, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->d, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->e, mapping_flavour); + printf ("%d/%s\n", s.sub.rp->f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: sp[:1]) map(tofrom: sp->sub.r.e) +{ + sp->b++; + sp->sub.r.d+=2; + sp->sub.r.e+=3; + sp->sub.r.f+=4; +} + + printf ("%d/%s\n", sp->b, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.d, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.e, mapping_flavour); + printf ("%d/%s\n", sp->sub.r.f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|1/unified}} 
+ // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(to: sp[:1]) map(to: sp->sub.rp[:1]) map(tofrom: sp->sub.rp->e) +{ + sp->b++; + sp->sub.rp->d+=2; + sp->sub.rp->e+=3; + sp->sub.rp->f+=4; +} + + printf ("%d/%s\n", sp->b, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->d, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->e, mapping_flavour); + printf ("%d/%s\n", sp->sub.rp->f, mapping_flavour); + + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: {{0/separate|2/unified}} + // CHECK-NEXT: 3 + // CHECK-NEXT: {{0/separate|4/unified}} + +#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1]) +{ + t.a++; + t.ptr[2]+=2; + t.b+=3; +} + + printf ("%d\n", t.a); // CHECK-NEXT: 1 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 2 + printf ("%d\n", t.b); // CHECK-NEXT: 3 + +#pragma omp target map(tofrom: t) map(tofrom: t.a) +{ + t.b++; +} + + printf ("%d\n", t.b); // CHECK-NEXT: 4 + +#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a) +{ + t.a++; + t.ptr[2]+=2; + t.b+=3; +} + + printf ("%d\n", t.a); // CHECK-NEXT: 2 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4 + printf ("%d\n", t.b); // CHECK-NEXT: 7 + +#pragma omp target map(tofrom: t) map(tofrom: t.ptr[2:1], t.a) +{ + /* Empty */ +} + + printf ("%d\n", t.a); // CHECK-NEXT: 2 + printf ("%d\n", t.ptr[2]); // CHECK-NEXT: 4 + printf ("%d\n", t.b); // CHECK-NEXT: 7 + + delete s.sub.rp; + delete sp->sub.rp; + + delete[] t.ptr; + delete[] tp->ptr; + + delete rp; + delete sp; + delete tp; + + return 0; +} diff --git a/offload/test/mapping/data_member_ref.cpp b/offload/test/mapping/data_member_ref.cpp index fdb8abcaa650..7947a62c169f 100644 --- a/offload/test/mapping/data_member_ref.cpp +++ b/offload/test/mapping/data_member_ref.cpp @@ -60,7 +60,8 @@ int main() { printf("Host %d %d.\n", Bar.VRef.Data, V.Data); // CHECK: Host 123456. 
printf("Host %d.\n", *Baz.VRef.Data); -#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2) +#pragma omp target map(Baz.VRef.Data) map(*Baz.VRef.Data) map(V1.Data[0 : 0]) \ + map(from : D1, D2) { // CHECK: Device 123456. D1 = *Baz.VRef.Data; diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp index c6c5657ae616..45fd042aedb0 100644 --- a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp +++ b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp @@ -44,8 +44,8 @@ int main() { int spp00fa = -1, spp00fca = -1, spp00fb_r = -1; __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]); -#pragma omp target map(tofrom: spp[0][0]) firstprivate(p) \ - map(from: spp00fa, spp00fca, spp00fb_r) +#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) firstprivate(p) \ + map(from : spp00fa, spp00fca, spp00fb_r) { spp00fa = spp[0][0].f.a; spp00fca = spp[0][0].f.c.a; diff --git a/offload/test/mapping/declare_mapper_nested_mappers.cpp b/offload/test/mapping/declare_mapper_nested_mappers.cpp index a9e3f05e0f5f..a59ed6980ec4 100644 --- a/offload/test/mapping/declare_mapper_nested_mappers.cpp +++ b/offload/test/mapping/declare_mapper_nested_mappers.cpp @@ -42,8 +42,8 @@ int main() { int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1; __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]), p1 = reinterpret_cast<__intptr_t>(&y[0]); -#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1) \ - map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r) +#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) \ + firstprivate(p, p1) map(from : spp00fa, spp00fb_r, spp00fg1, spp00fg_r) { spp00fa = spp[0][0].f.a; spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 
1 : 0; diff --git a/offload/test/mapping/lambda_by_value.cpp b/offload/test/mapping/lambda_by_value.cpp index 5516dedd72a9..4c0278d40592 100644 --- a/offload/test/mapping/lambda_by_value.cpp +++ b/offload/test/mapping/lambda_by_value.cpp @@ -1,4 +1,5 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compileopt-generic -fno-exceptions +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic #include <stdint.h> #include <stdio.h> diff --git a/offload/test/mapping/map_back_race.cpp b/offload/test/mapping/map_back_race.cpp index 8a988d3be3b4..49bbe87e2449 100644 --- a/offload/test/mapping/map_back_race.cpp +++ b/offload/test/mapping/map_back_race.cpp @@ -2,6 +2,9 @@ // Taken from https://github.com/llvm/llvm-project/issues/54216 +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu + #include <algorithm> #include <cstdlib> #include <iostream> diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c index 7be1ba465e7d..1934b702dbba 100644 --- a/offload/test/mapping/map_both_pointer_pointee.c +++ b/offload/test/mapping/map_both_pointer_pointee.c @@ -1,11 +1,10 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda +// RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: unified_shared_memory // UNSUPPORTED: amdgcn-amd-amdhsa +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: nvidiagpu #pragma omp declare target int *ptr1; diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c index c3b0dd2f49e6..869fb8ca9bc2 100644 --- a/offload/test/mapping/map_ptr_and_star_global.c +++ 
b/offload/test/mapping/map_ptr_and_star_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c index f0ca84d1cc4d..97fa7cd53715 100644 --- a/offload/test/mapping/map_ptr_and_star_local.c +++ b/offload/test/mapping/map_ptr_and_star_local.c @@ -1,4 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c index a3a10b6c9b21..839db068aa90 100644 --- a/offload/test/mapping/map_ptr_and_subscript_global.c +++ b/offload/test/mapping/map_ptr_and_subscript_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c index bb44999541a7..68ac9dc0917f 100644 --- a/offload/test/mapping/map_ptr_and_subscript_local.c +++ b/offload/test/mapping/map_ptr_and_subscript_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c index 10e72e070dbc..f855e87d7218 100644 --- a/offload/test/mapping/map_structptr_and_member_global.c +++ b/offload/test/mapping/map_structptr_and_member_global.c @@ -1,4 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: 
gpu #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c index 9e59551ad3d6..bd9e2a89eb6f 100644 --- a/offload/test/mapping/map_structptr_and_member_local.c +++ b/offload/test/mapping/map_structptr_and_member_local.c @@ -1,4 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/ptr_and_obj_motion.c b/offload/test/mapping/ptr_and_obj_motion.c index 8fa2c9865b4a..a94c07aadc1b 100644 --- a/offload/test/mapping/ptr_and_obj_motion.c +++ b/offload/test/mapping/ptr_and_obj_motion.c @@ -17,7 +17,7 @@ void init(double vertexx[]) { } void change(DV *dvptr) { -#pragma omp target map(dvptr->dataptr[0 : 100]) +#pragma omp target map(dvptr->dataptr[0 : 100]) map(alloc : dvptr -> dataptr) { printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]); dvptr->dataptr[77] += 1.0; diff --git a/offload/test/mapping/target_derefence_array_pointrs.cpp b/offload/test/mapping/target_derefence_array_pointrs.cpp index a6dd4069a8f5..d213c8744363 100644 --- a/offload/test/mapping/target_derefence_array_pointrs.cpp +++ b/offload/test/mapping/target_derefence_array_pointrs.cpp @@ -18,23 +18,24 @@ void foo(int **t1d) { for (j = 0; j < 3; j++) (*t1d)[j] = 0; -#pragma omp target map(tofrom : (*t1d)[0 : 3]) +#pragma omp target map(tofrom : (*t1d)[0 : 3]) map(alloc : *t1d) { (*t1d)[1] = 1; } // CHECK: 1 printf("%d\n", (*t1d)[1]); -#pragma omp target map(tofrom : (**t2d)[0 : 3]) +#pragma omp target map(tofrom : (**t2d)[0 : 3]) map(alloc : **t2d, *t2d) { (**t2d)[1] = 2; } // CHECK: 2 printf("%d\n", (**t2d)[1]); -#pragma omp target map(tofrom : (***t3d)[0 : 3]) +#pragma omp target map(tofrom : (***t3d)[0 : 3]) \ + map(alloc : ***t3d, **t3d, *t3d) { (***t3d)[1] = 3; } 
// CHECK: 3 printf("%d\n", (***t3d)[1]); -#pragma omp target map(tofrom : (**t1d)) +#pragma omp target map(tofrom : (**t1d)) map(alloc : *t1d) { (*t1d)[0] = 4; } // CHECK: 4 printf("%d\n", (*t1d)[0]); -#pragma omp target map(tofrom : (*(*(t1d + a) + b))) +#pragma omp target map(tofrom : (*(*(t1d + a) + b))) map(to : *(t1d + a)) { *(*(t1d + a) + b) = 5; } // CHECK: 5 printf("%d\n", *(*(t1d + a) + b)); @@ -49,7 +50,7 @@ void bar() { for (int i = 0; i < 3; i++) { (**a)[1] = i; } -#pragma omp target map((**a)[ : 3]) +#pragma omp target map((**a)[ : 3]) map(alloc : **a, *a) { (**a)[1] = 6; // CHECK: 6 @@ -73,7 +74,8 @@ void zoo(int **f, SSA *sa) { *(f + sa->i + 1) = t; *(sa->sa->i + *(f + sa->i + 1)) = 4; printf("%d\n", *(sa->sa->i + *(1 + sa->i + f))); -#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f))) +#pragma omp target map(*(sa->sa->i + *(1 + sa->i + f))) map(alloc : sa->sa) \ + map(to : sa->i) map(to : sa->sa->i) map(to : *(1 + sa->i + f)) { *(sa->sa->i + *(1 + sa->i + f)) = 7; } // CHECK: 7 printf("%d\n", *(sa->sa->i + *(1 + sa->i + f))); @@ -87,13 +89,13 @@ void xoo() { void yoo(int **x) { *x = (int *)malloc(2 * sizeof(int)); -#pragma omp target map(**x) +#pragma omp target map(**x) map(alloc : *x) { **x = 8; // CHECK: 8 printf("%d\n", **x); } -#pragma omp target map(*(*x + 1)) +#pragma omp target map(*(*x + 1)) map(alloc : *x) { *(*x + 1) = 9; // CHECK: 9 diff --git a/offload/test/mapping/target_has_device_addr.c b/offload/test/mapping/target_has_device_addr.c index e8bfff868c7e..f238832c4405 100644 --- a/offload/test/mapping/target_has_device_addr.c +++ b/offload/test/mapping/target_has_device_addr.c @@ -66,8 +66,9 @@ void zoo() { short **xpp = &xp[0]; x[1] = 111; -#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1]) -#pragma omp target has_device_addr(xpp[1][1]) +#pragma omp target data map(tofrom : xpp[1][1]) map(xpp[1]) \ + use_device_addr(xpp[1]) +#pragma omp target has_device_addr(xpp[1]) { xpp[1][1] = 222; // CHECK: 222 
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp new file mode 100644 index 000000000000..3b1a8192bf2c --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp @@ -0,0 +1,85 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. 
+// CHECK: A: 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. +// CHECK: B: 1 +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. +// CHECK: D: 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp new file mode 100644 index 000000000000..b9ebde431e7b --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp @@ -0,0 +1,143 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. 
+// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. 
+// CHECK: C: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. 
+// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. +// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp new file mode 100644 index 000000000000..e9a1124bc461 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp @@ -0,0 +1,98 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. 
+// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. 
+// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp new file mode 100644 index 000000000000..0090cdb09536 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp @@ -0,0 +1,158 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. 
+// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. 
+// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. +// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp new file mode 100644 index 000000000000..883297f7e90c --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp @@ -0,0 +1,93 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). 
+// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different 
base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. +// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp new file mode 100644 index 000000000000..79c6f69edba8 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp @@ -0,0 +1,159 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. 
+// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. 
+// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. 
+// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp new file mode 100644 index 000000000000..f018c65f36ec --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. 
+// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp new file mode 100644 index 000000000000..9360db419504 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp @@ -0,0 +1,166 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. 
+// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. 
+// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c index 5c2bb8a48f6e..4a9dbe252f76 100644 --- a/offload/test/mapping/target_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c @@ -12,7 +12,9 @@ int main() { printf("%d, %p\n", xp[1], &xp[1]); #pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x) #pragma omp target is_device_ptr(xp) - { xp[1] = 222; } + { + xp[1] = 222; + } // CHECK: 222 printf("%d, %p\n", xp[1], &xp[1]); } diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c index 7a5babd69253..28ec6857fa1a 100644 --- a/offload/test/mapping/target_wrong_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c @@ -14,7 +14,7 @@ int main() { // CHECK: host addr=0x[[#%x,HOST_ADDR:]] fprintf(stderr, "host addr=%p\n", x); -#pragma omp target data map(to : x [0:10]) +#pragma omp target data map(to : x[0 : 10]) { // CHECK: omptarget device 0 info: variable x does not have a valid device // counterpart @@ -27,4 +27,3 @@ int main() { return 0; } - diff --git a/offload/test/mapping/array_section_use_device_ptr.c 
b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c index 86e2875c35c4..4cfcce28c112 100644 --- a/offload/test/mapping/array_section_use_device_ptr.c +++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c @@ -20,7 +20,9 @@ int main() { float *A_dev = NULL; #pragma omp target data use_device_ptr(A) - { A_dev = A; } + { + A_dev = A; + } #pragma omp target exit data map(delete : A[FROM : LENGTH]) // CHECK: Success diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp new file mode 100644 index 000000000000..a7745de53298 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// CHECK: A: 1 +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// CHECK: B: 1 +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range. Lookup should succeed. +// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not. 
Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp new file mode 100644 index 000000000000..fe3cdb56e4ba --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp @@ -0,0 +1,125 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 1 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. 
+// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp new file mode 100644 index 000000000000..66e65de4195a --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp @@ -0,0 +1,111 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range. Lookup should succeed. 
+// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not. Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp new file mode 100644 index 000000000000..419ab3eb33d4 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp @@ -0,0 +1,136 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. 
+// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. 
Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu index 1f84a0e1288d..b2e1edf51e17 100644 --- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -5,10 +5,10 @@ // RUN: %t | %fcheck-generic // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO +// REQUIRES: gpu +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include <stdio.h> diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c index 2f30b035afbb..ff7fa51aafc2 100644 --- a/offload/test/offloading/bug51781.c +++ b/offload/test/offloading/bug51781.c @@ -16,6 +16,7 @@ // the generic state machine. // // RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \ +// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \ // RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom @@ -24,7 +25,9 @@ // Repeat with reduction clause, which has managed to break the custom state // machine in the past. 
// -// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt -DADD_REDUCTION \ +// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \ +// RUN: -DADD_REDUCTION \ +// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \ // RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom diff --git a/offload/test/offloading/force-usm.cpp b/offload/test/offloading/force-usm.cpp index a043ba47f54a..9988c3dc4e9e 100644 --- a/offload/test/offloading/force-usm.cpp +++ b/offload/test/offloading/force-usm.cpp @@ -48,7 +48,7 @@ int main(void) { // clang-format off // NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 -// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12 +// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12 // NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 // NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI // NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4 diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90 new file mode 100644 index 000000000000..b44c0b281527 --- /dev/null +++ b/offload/test/offloading/fortran/declare-target-automap.f90 @@ -0,0 +1,40 @@ +!Offloading test for AUTOMAP modifier in declare target enter +! REQUIRES: flang, amdgpu + +! FIXME: https://github.com/llvm/llvm-project/issues/161265 +! XFAIL: amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +program automap_program + use iso_c_binding, only: c_loc + use omp_lib, only: omp_get_default_device, omp_target_is_present + integer, parameter :: N = 10 + integer :: i + integer, allocatable, target :: automap_array(:) + !$omp declare target enter(automap:automap_array) + + ! false since the storage is not present even though the descriptor is present + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 0 + + allocate (automap_array(N)) + ! true since the storage should be allocated and reference count incremented by the allocate + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 1 + + ! since storage is present this should not be a runtime error + !$omp target teams loop + do i = 1, N + automap_array(i) = i + end do + + !$omp target update from(automap_array) + write (*, *) automap_array + ! CHECK: 1 2 3 4 5 6 7 8 9 10 + + deallocate (automap_array) + + ! automap_array should have it's storage unmapped on device here + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 0 +end program diff --git a/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90 b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90 new file mode 100644 index 000000000000..45a18b7f38ed --- /dev/null +++ b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90 @@ -0,0 +1,101 @@ +! This test doesn't expect any results, the pass condition is running to completion +! without any memory access errors on device or mapping issues from descriptor +! collisions due to local descriptors being placed on device and not being unampped +! before a subsequent local descriptor residing at the same address is mapped to +! device. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +module test +contains + subroutine kernel_1d(array) + implicit none + real, dimension(:) :: array + integer :: i + + !$omp target enter data map(alloc:array) + !$omp target teams distribute parallel do + do i=1, ubound(array, 1) + array(i) = 42.0 + end do + !$omp target update from(array) + end subroutine + + subroutine kernel_2d(array) + implicit none + real, dimension(:,:) :: array + integer :: i, j + + !$omp target enter data map(alloc:array) + !$omp target teams distribute parallel do collapse(2) + do j=1, ubound(array, 2) + do i=1, ubound(array, 1) + array(i,j) = 42.0 + end do + end do + !$omp target update from(array) + end subroutine + + subroutine kernel_3d(array) + implicit none + real, dimension(:,:,:) :: array + integer :: i, j, k + + !$omp target enter data map(alloc:array) + !$omp target teams distribute parallel do collapse(3) + do k=1, ubound(array, 3) + do j=1, ubound(array, 2) + do i=1, ubound(array, 1) + array(i,j,k) = 42.0 + end do + end do + end do + !$omp target update from(array) + end subroutine + + subroutine kernel_4d(array) + implicit none + real, dimension(:,:,:,:) :: array + integer :: i, j, k, l + + !$omp target enter data map(alloc:array) + !$omp target teams distribute parallel do collapse(4) + do l=1, ubound(array, 4) + do k=1, ubound(array, 3) + do j=1, ubound(array, 2) + do i=1, ubound(array, 1) + array(i,j,k,l) = 42.0 + end do + end do + end do + enddo + !$omp target update from(array) + end subroutine +end module + +program main + use test + implicit none + integer, parameter :: n = 2 + real :: array1(n) + real :: array2(n,n) + real :: array3(n,n,n) + real :: array4(n,n,n,n) + + call kernel_1d(array1) + call kernel_2d(array2) + call kernel_3d(array3) + call kernel_4d(array4) + + print *, array1 + print *, array2 + print *, array3 + print *, array4 + print *, "PASS" +end program + +! CHECK: 42. 42. +! CHECK: 42. 42. 42. 42. +! CHECK: 42. 42. 42. 42. 42. 42. 42. 42. +! 
CHECK: 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. +! CHECK: PASS diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 new file mode 100644 index 000000000000..c6f576acb90b --- /dev/null +++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 @@ -0,0 +1,53 @@ +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +module saxpymod + use iso_fortran_env + public :: saxpy +contains + +subroutine saxpy(a, x, y, n, m) + use iso_fortran_env + implicit none + integer,intent(in) :: n, m + real(kind=real32),intent(in) :: a + real(kind=real32), dimension(:,:),intent(in) :: x + real(kind=real32), dimension(:,:),intent(inout) :: y + integer :: i, j + + do concurrent(i=1:n, j=1:m) + y(i,j) = a * x(i,j) + y(i,j) + end do + + write(*,*) "plausibility check:" + write(*,'("y(1,1) ",f8.6)') y(1,1) + write(*,'("y(n,m) ",f8.6)') y(n,m) +end subroutine saxpy + +end module saxpymod + +program main + use iso_fortran_env + use saxpymod, ONLY:saxpy + implicit none + + integer,parameter :: n = 1000, m=10000 + real(kind=real32), allocatable, dimension(:,:) :: x, y + real(kind=real32) :: a + integer :: i + + allocate(x(1:n,1:m), y(1:n,1:m)) + a = 2.0_real32 + x(:,:) = 1.0_real32 + y(:,:) = 2.0_real32 + + call saxpy(a, x, y, n, m) + + deallocate(x,y) +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: plausibility check: +! CHECK: y(1,1) 4.0 +! CHECK: y(n,m) 4.0 diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 new file mode 100644 index 000000000000..e094a1d7459e --- /dev/null +++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 @@ -0,0 +1,53 @@ +! 
REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +module saxpymod + use iso_fortran_env + public :: saxpy +contains + +subroutine saxpy(a, x, y, n) + use iso_fortran_env + implicit none + integer,intent(in) :: n + real(kind=real32),intent(in) :: a + real(kind=real32), dimension(:),intent(in) :: x + real(kind=real32), dimension(:),intent(inout) :: y + integer :: i + + do concurrent(i=1:n) + y(i) = a * x(i) + y(i) + end do + + write(*,*) "plausibility check:" + write(*,'("y(1) ",f8.6)') y(1) + write(*,'("y(n) ",f8.6)') y(n) +end subroutine saxpy + +end module saxpymod + +program main + use iso_fortran_env + use saxpymod, ONLY:saxpy + implicit none + + integer,parameter :: n = 10000000 + real(kind=real32), allocatable, dimension(:) :: x, y + real(kind=real32) :: a + integer :: i + + allocate(x(1:n), y(1:n)) + a = 2.0_real32 + x(:) = 1.0_real32 + y(:) = 2.0_real32 + + call saxpy(a, x, y, n) + + deallocate(x,y) +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: plausibility check: +! CHECK: y(1) 4.0 +! CHECK: y(n) 4.0 diff --git a/offload/test/offloading/fortran/dtype-char-array-map-2.f90 b/offload/test/offloading/fortran/dtype-char-array-map-2.f90 new file mode 100644 index 000000000000..f17ea9e53853 --- /dev/null +++ b/offload/test/offloading/fortran/dtype-char-array-map-2.f90 @@ -0,0 +1,25 @@ +! Offloading test that verifies certain type of character string arrays +! map to and from device without problem. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + type char_t + CHARACTER(LEN=16), dimension(10,10) :: char_arr + end type char_t + type(char_t) :: dtype_char + +!$omp target enter data map(alloc:dtype_char%char_arr) + +!$omp target + dtype_char%char_arr(2,2) = 'c' +!$omp end target + +!$omp target update from(dtype_char%char_arr) + + + print *, dtype_char%char_arr(2,2) +end program + +!CHECK: c diff --git a/offload/test/offloading/fortran/dtype-char-array-map.f90 b/offload/test/offloading/fortran/dtype-char-array-map.f90 new file mode 100644 index 000000000000..6b72c9e95101 --- /dev/null +++ b/offload/test/offloading/fortran/dtype-char-array-map.f90 @@ -0,0 +1,27 @@ +! Offloading test that verifies certain type of character string arrays +! (in this case allocatable) map to and from device without problem. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + type char_t + CHARACTER(LEN=16), dimension(:,:), allocatable :: char_arr + end type char_t + type(char_t) :: dtype_char + + allocate(dtype_char%char_arr(10,10)) + +!$omp target enter data map(alloc:dtype_char%char_arr) + +!$omp target + dtype_char%char_arr(2,2) = 'c' +!$omp end target + +!$omp target update from(dtype_char%char_arr) + + + print *, dtype_char%char_arr(2,2) +end program + +!CHECK: c diff --git a/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90 b/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90 new file mode 100644 index 000000000000..d8d5e1b5631a --- /dev/null +++ b/offload/test/offloading/fortran/target-declare-mapper-allocatable.f90 @@ -0,0 +1,48 @@ +! This test validates that declare mapper for a derived type with an +! allocatable component preserves TO/FROM semantics for the component, +! ensuring the payload is copied back to the host on target exit. + +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic + +program target_declare_mapper_allocatable + implicit none + + type :: real_t + real, allocatable :: real_arr(:) + end type real_t + + ! Map the allocatable array payload via a named mapper. + !$omp declare mapper (xyz : real_t :: t) map(tofrom: t%real_arr) + + type(real_t) :: r + integer :: i + logical :: ok + + allocate(r%real_arr(10)) + r%real_arr = 1.0 + + !$omp target map(mapper(xyz), tofrom: r) + do i = 1, size(r%real_arr) + r%real_arr(i) = 3.0 + end do + !$omp end target + + ok = .true. + do i = 1, size(r%real_arr) + if (r%real_arr(i) /= 3.0) ok = .false. + end do + if (ok) then + print *, "Test passed!" + else + print *, "Test failed!" + do i = 1, size(r%real_arr) + print *, r%real_arr(i) + end do + end if + + deallocate(r%real_arr) +end program target_declare_mapper_allocatable + +! CHECK: Test passed! diff --git a/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 new file mode 100644 index 000000000000..65e04af66e02 --- /dev/null +++ b/offload/test/offloading/fortran/target-declare-mapper-parent-allocatable.f90 @@ -0,0 +1,43 @@ +! This test validates that declare mapper for a derived type that extends +! a parent type with an allocatable component correctly maps the nested +! allocatable payload via the mapper when the whole object is mapped on +! target. + +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic + +program target_declare_mapper_parent_allocatable + implicit none + + type, abstract :: base_t + real, allocatable :: base_arr(:) + end type base_t + + type, extends(base_t) :: real_t + real, allocatable :: real_arr(:) + end type real_t + !$omp declare mapper(custommapper: real_t :: t) map(t%base_arr, t%real_arr) + + type(real_t) :: r + integer :: i + allocate(r%base_arr(10), source=1.0) + allocate(r%real_arr(10), source=1.0) + + !$omp target map(mapper(custommapper), tofrom: r) + do i = 1, size(r%base_arr) + r%base_arr(i) = 2.0 + r%real_arr(i) = 3.0 + r%real_arr(i) = r%base_arr(1) + end do + !$omp end target + + + !CHECK: base_arr: 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. + print*, "base_arr: ", r%base_arr + !CHECK: real_arr: 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. + print*, "real_arr: ", r%real_arr + + deallocate(r%real_arr) + deallocate(r%base_arr) +end program target_declare_mapper_parent_allocatable diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90 new file mode 100644 index 000000000000..3c88b00a5354 --- /dev/null +++ b/offload/test/offloading/fortran/target-no-loop.f90 @@ -0,0 +1,97 @@ +! REQUIRES: flang +! REQUIRES: gpu + +! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription +! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +function check_errors(array) result (errors) + integer, intent(in) :: array(1024) + integer :: errors + integer :: i + errors = 0 + do i = 1, 1024 + if ( array( i) .ne. (i) ) then + errors = errors + 1 + end if + end do +end function + +program main + use omp_lib + implicit none + integer :: i,j,red + integer :: array(1024), errors = 0 + array = 1 + + ! 
No-loop kernel + !$omp target teams distribute parallel do + do i = 1, 1024 + array(i) = i + end do + errors = errors + check_errors(array) + + ! SPMD kernel (num_teams clause blocks promotion to no-loop) + array = 1 + !$omp target teams distribute parallel do num_teams(3) + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! No-loop kernel + array = 1 + !$omp target teams distribute parallel do num_threads(64) + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! SPMD kernel + array = 1 + !$omp target parallel do + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! Generic kernel + array = 1 + !$omp target teams distribute + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! SPMD kernel (reduction clause blocks promotion to no-loop) + array = 1 + red =0 + !$omp target teams distribute parallel do reduction(+:red) + do i = 1, 1024 + red = red + array(i) + end do + + if (red .ne. 1024) then + errors = errors + 1 + end if + + print *,"number of errors: ", errors + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode +! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 3 Teams x Thrds: 3x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode +! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 3 Teams x Thrds: 1x 16 +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} Generic mode +! CHECK: info: #Args: 3 Teams x Thrds: 16x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 4 Teams x Thrds: 16x 16 {{.*}} +! 
CHECK: number of errors: 0 + diff --git a/offload/test/offloading/interop.c b/offload/test/offloading/interop.c index 26287e3ec533..d9fa2ef883b9 100644 --- a/offload/test/offloading/interop.c +++ b/offload/test/offloading/interop.c @@ -1,5 +1,6 @@ // RUN: %libomptarget-compile-run-and-check-generic -// REQUIRES: nvptx64-nvidia-cuda + +// XFAIL: * #include <assert.h> #include <omp.h> diff --git a/offload/test/offloading/mandatory_but_no_devices.c b/offload/test/offloading/mandatory_but_no_devices.c index ecdee72acad0..df8a5f3b9278 100644 --- a/offload/test/offloading/mandatory_but_no_devices.c +++ b/offload/test/offloading/mandatory_but_no_devices.c @@ -3,6 +3,47 @@ // device. This behavior is proposed for OpenMP 5.2 in OpenMP spec github // issue 2669. +// AMD Tests +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR=target +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target teams' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -DDIR='target data map(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target enter data map(to:X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target exit data map(from:X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: 
%libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target update to(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa \ +// RUN: -DDIR='target update from(X)' +// RUN: env OMP_TARGET_OFFLOAD=mandatory ROCR_VISIBLE_DEVICES= \ +// RUN: %libomptarget-run-fail-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa + +// Nvidia Tests // RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR=target // RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \ // RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \ @@ -42,8 +83,6 @@ // RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \ // RUN: %fcheck-nvptx64-nvidia-cuda -// REQUIRES: nvptx64-nvidia-cuda - #include <omp.h> #include <stdio.h> diff --git a/offload/test/offloading/memory_manager.cpp b/offload/test/offloading/memory_manager.cpp index fba1e4a54012..d6d8697fcdec 100644 --- a/offload/test/offloading/memory_manager.cpp +++ b/offload/test/offloading/memory_manager.cpp @@ -1,7 +1,5 @@ // RUN: %libomptarget-compilexx-run-and-check-generic -// REQUIRES: nvidiagpu - #include <omp.h> #include <cassert> diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c index 8ee6b51fb681..a007521a5c74 100644 --- a/offload/test/offloading/single_threaded_for_barrier_hang_1.c +++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c @@ -1,6 +1,9 @@ // RUN: %libomptarget-compile-run-and-check-generic // RUN: %libomptarget-compileopt-run-and-check-generic +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu + #include <omp.h> #include <stdio.h> diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c index a98abd6922da..cabd2ed3dde7 100644 --- 
a/offload/test/offloading/single_threaded_for_barrier_hang_2.c +++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c @@ -1,6 +1,7 @@ // RUN: %libomptarget-compile-run-and-check-generic -// FIXME: This fails with optimization enabled and prints b: 0 -// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic + +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu #include <omp.h> #include <stdio.h> diff --git a/offload/test/offloading/spmdization.c b/offload/test/offloading/spmdization.c index 7f3f47d9ef32..48627cd7dae1 100644 --- a/offload/test/offloading/spmdization.c +++ b/offload/test/offloading/spmdization.c @@ -2,7 +2,8 @@ // RUN: %libomptarget-compileopt-generic // RUN: env LIBOMPTARGET_INFO=16 \ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,SPMD -// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization +// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization \ +// RUN: -Xoffload-linker -mllvm=--openmp-opt-disable-spmdization // RUN: env LIBOMPTARGET_INFO=16 \ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,GENERIC // clang-format on diff --git a/offload/test/offloading/strided_multiple_update.c b/offload/test/offloading/strided_multiple_update.c new file mode 100644 index 000000000000..a3e8d10863ae --- /dev/null +++ b/offload/test/offloading/strided_multiple_update.c @@ -0,0 +1,62 @@ +// This test checks that #pragma omp target update from(data1[0:3:4], +// data2[0:2:5]) correctly updates disjoint strided sections of multiple arrays +// from the device to the host. 
+ +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include <stdio.h> + +int main() { + int len = 12; + double data1[len], data2[len]; + +// Initial values +#pragma omp target map(tofrom : data1[0 : len], data2[0 : len]) + { + for (int i = 0; i < len; i++) { + data1[i] = i; + data2[i] = i * 10; + } + } + + printf("original host array values:\n"); + printf("data1: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data1[i]); + printf("\ndata2: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data2[i]); + printf("\n\n"); + +#pragma omp target data map(to : data1[0 : len], data2[0 : len]) + { +// Modify arrays on device +#pragma omp target + { + for (int i = 0; i < len; i++) + data1[i] += i; + for (int i = 0; i < len; i++) + data2[i] += 100; + } + +// data1[0:3:4] // indices 0,4,8 +// data2[0:2:5] // indices 0,5 +#pragma omp target update from(data1[0 : 3 : 4], data2[0 : 2 : 5]) + } + + printf("device array values after update from:\n"); + printf("data1: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data1[i]); + printf("\ndata2: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data2[i]); + printf("\n\n"); + + // CHECK: data1: 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 + // CHECK: data2: 0.0 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 100.0 110.0 + + // CHECK: data1: 0.0 1.0 2.0 3.0 8.0 5.0 6.0 7.0 16.0 9.0 10.0 11.0 + // CHECK: data2: 100.0 10.0 20.0 30.0 40.0 150.0 60.0 70.0 80.0 90.0 100.0 + // 110.0 +} diff --git a/offload/test/offloading/strided_partial_update.c b/offload/test/offloading/strided_partial_update.c new file mode 100644 index 000000000000..15d477f2b9b7 --- /dev/null +++ b/offload/test/offloading/strided_partial_update.c @@ -0,0 +1,63 @@ +// This test checks that #pragma omp target update from(data[0:4:3]) correctly +// updates every third element (stride 3) from the device to the host, partially +// across the array + +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include 
<stdio.h> + +int main() { + int len = 11; + double data[len]; + +#pragma omp target map(tofrom : data[0 : len]) + { + for (int i = 0; i < len; i++) + data[i] = i; + } + + // Initial values + printf("original host array values:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + +#pragma omp target data map(to : data[0 : len]) + { +// Modify arrays on device +#pragma omp target + for (int i = 0; i < len; i++) + data[i] += i; + +#pragma omp target update from(data[0 : 4 : 3]) // indices 0,3,6,9 + } + + printf("device array values after update from:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 2.000000 + // CHECK: 3.000000 + // CHECK: 4.000000 + // CHECK: 5.000000 + // CHECK: 6.000000 + // CHECK: 7.000000 + // CHECK: 8.000000 + // CHECK: 9.000000 + // CHECK: 10.000000 + + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 2.000000 + // CHECK: 6.000000 + // CHECK: 4.000000 + // CHECK: 5.000000 + // CHECK: 12.000000 + // CHECK: 7.000000 + // CHECK: 8.000000 + // CHECK: 18.000000 + // CHECK: 10.000000 +} diff --git a/offload/test/offloading/strided_update.c b/offload/test/offloading/strided_update.c new file mode 100644 index 000000000000..fe875b7fd55c --- /dev/null +++ b/offload/test/offloading/strided_update.c @@ -0,0 +1,54 @@ +// This test checks that "update from" clause in OpenMP is supported when the +// elements are updated in a non-contiguous manner. 
This test checks that +// #pragma omp target update from(data[0:4:2]) correctly updates only every +// other element (stride 2) from the device to the host + +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include <stdio.h> + +int main() { + int len = 8; + double data[len]; +#pragma omp target map(tofrom : len, data[0 : len]) + { + for (int i = 0; i < len; i++) { + data[i] = i; + } + } + // Initial values + printf("original host array values:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + +#pragma omp target data map(to : len, data[0 : len]) + { +// Modify arrays on device +#pragma omp target + for (int i = 0; i < len; i++) { + data[i] += i; + } + +#pragma omp target update from(data[0 : 4 : 2]) + } + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 4.000000 + // CHECK: 3.000000 + // CHECK: 8.000000 + // CHECK: 5.000000 + // CHECK: 12.000000 + // CHECK: 7.000000 + // CHECK-NOT: 2.000000 + // CHECK-NOT: 6.000000 + // CHECK-NOT: 10.000000 + // CHECK-NOT: 14.000000 + + printf("from target array results:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + + return 0; +} diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c index bdd028352e40..b30ce12ef1ea 100644 --- a/offload/test/sanitizer/ptr_outside_alloc_1.c +++ b/offload/test/sanitizer/ptr_outside_alloc_1.c @@ -5,12 +5,10 @@ // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO -// UNSUPPORTED: s390x-ibm-linux-gnu -// UNSUPPORTED: s390x-ibm-linux-gnu-LTO +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: nvidiagpu +// +// REQUIRES: gpu 
#include <omp.h> diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c index 6a67962f9eb3..3bb8bdaca8b4 100644 --- a/offload/test/sanitizer/ptr_outside_alloc_2.c +++ b/offload/test/sanitizer/ptr_outside_alloc_2.c @@ -3,12 +3,10 @@ // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO -// UNSUPPORTED: s390x-ibm-linux-gnu -// UNSUPPORTED: s390x-ibm-linux-gnu-LTO +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: nvidiagpu +// +// REQUIRES: gpu #include <omp.h> diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c index c4783c5c36df..acc1de373f9e 100644 --- a/offload/test/sanitizer/use_after_free_1.c +++ b/offload/test/sanitizer/use_after_free_1.c @@ -5,12 +5,10 @@ // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO -// UNSUPPORTED: s390x-ibm-linux-gnu -// UNSUPPORTED: s390x-ibm-linux-gnu-LTO +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: nvidiagpu +// +// REQUIRES: gpu #include <omp.h> diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c index 1c1e09744a75..3d70fb7b3a3f 100644 --- a/offload/test/sanitizer/use_after_free_2.c +++ b/offload/test/sanitizer/use_after_free_2.c @@ -3,12 +3,10 @@ // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION 
OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO -// UNSUPPORTED: s390x-ibm-linux-gnu -// UNSUPPORTED: s390x-ibm-linux-gnu-LTO +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: nvidiagpu +// +// REQUIRES: gpu // If offload memory pooling is enabled for a large allocation, reuse error is // not detected. UNSUPPORTED: large_allocation_memory_pool diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c index 6f497309df2f..1ce8d4ac07f6 100644 --- a/offload/test/tools/llvm-omp-device-info.c +++ b/offload/test/tools/llvm-omp-device-info.c @@ -2,5 +2,5 @@ // // Just check any device was found and something is printed // -// CHECK: Found {{[1-9].*}} devices: -// CHECK: Device 0: +// CHECK: Num Devices: {{[1-9].*}} +// CHECK: [{{[1-9A-Za-z].*}}] diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td index e919492cc5bf..41949db7226a 100644 --- a/offload/test/tools/offload-tblgen/default_returns.td +++ b/offload/test/tools/offload-tblgen/default_returns.td @@ -6,13 +6,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td index c66d5b488b46..94ea820d453e 100644 --- a/offload/test/tools/offload-tblgen/entry_points.td +++ b/offload/test/tools/offload-tblgen/entry_points.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : 
Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td index dec93577b57e..2802c78a2947 100644 --- a/offload/test/tools/offload-tblgen/functions_basic.td +++ b/offload/test/tools/offload-tblgen/functions_basic.td @@ -6,8 +6,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td index aec20129343f..8d7aa00c5f15 100644 --- a/offload/test/tools/offload-tblgen/functions_code_loc.td +++ b/offload/test/tools/offload-tblgen/functions_code_loc.td @@ -7,8 +7,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td b/offload/test/tools/offload-tblgen/functions_ranged_param.td index d0996b231973..1ce8b394b157 100644 --- a/offload/test/tools/offload-tblgen/functions_ranged_param.td +++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td @@ -8,13 +8,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def some_handle_t : Handle { let desc = "An example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/print_enum.td b/offload/test/tools/offload-tblgen/print_enum.td index 97f869689293..c7573a9a415c 100644 --- 
a/offload/test/tools/offload-tblgen/print_enum.td +++ b/offload/test/tools/offload-tblgen/print_enum.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : Enum { - let name = "my_enum_t"; +def my_enum_t : Enum { let desc = "An example enum"; let etors =[ Etor<"VALUE_ONE", "The first enum value">, diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td index ce1fe4c52760..74b39f145a40 100644 --- a/offload/test/tools/offload-tblgen/print_function.td +++ b/offload/test/tools/offload-tblgen/print_function.td @@ -5,13 +5,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td index 95964e32f0c9..b32531aac9c8 100644 --- a/offload/test/tools/offload-tblgen/type_tagged_enum.td +++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td @@ -9,13 +9,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def some_handle_t: Handle { let desc = "An example handle type"; } -def : Enum { - let name = "my_type_tagged_enum_t"; +def my_type_tagged_enum_t : Enum { let desc = "Example type tagged enum"; let is_typed = 1; let etors = [ @@ -34,8 +32,7 @@ def : Enum { // CHECK-API-NEXT: [some_handle_t] Value three. 
// CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2, -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt index 3787c12f940a..cc2d0a6add8b 100644 --- a/offload/tools/deviceinfo/CMakeLists.txt +++ b/offload/tools/deviceinfo/CMakeLists.txt @@ -4,10 +4,6 @@ add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp) llvm_update_compile_flags(llvm-offload-device-info) -target_include_directories(llvm-offload-device-info PRIVATE - ${LIBOMPTARGET_INCLUDE_DIR} -) target_link_libraries(llvm-offload-device-info PRIVATE - omp - omptarget + LLVMOffload ) diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 2228fbf3ec17..9b58d67f017c 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -1,4 +1,4 @@ -//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===// +//===- llvm-offload-device-info.cpp - Print liboffload properties ---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,27 +6,272 @@ // //===----------------------------------------------------------------------===// // -// This is a command line utility that, by using LLVM/Offload, and the device -// plugins, list devices information as seen by the runtime. 
+// This is a command line utility that, by using the new liboffload API, prints +// all devices and properties // //===----------------------------------------------------------------------===// -#include "omptarget.h" -#include <cstdio> +#include <OffloadAPI.h> +#include <iostream> +#include <vector> -int main(int argc, char **argv) { - __tgt_bin_desc EmptyDesc = {0, nullptr, nullptr, nullptr}; - __tgt_register_lib(&EmptyDesc); - __tgt_init_all_rtls(); +#define OFFLOAD_ERR(X) \ + if (auto Err = X) { \ + return Err; \ + } + +enum class PrintKind { + NORMAL, + FP_FLAGS, +}; + +template <typename T, PrintKind PK = PrintKind::NORMAL> +void doWrite(std::ostream &S, T &&Val) { + S << Val; +} + +template <> +void doWrite<ol_platform_backend_t>(std::ostream &S, + ol_platform_backend_t &&Val) { + switch (Val) { + case OL_PLATFORM_BACKEND_UNKNOWN: + S << "UNKNOWN"; + break; + case OL_PLATFORM_BACKEND_CUDA: + S << "CUDA"; + break; + case OL_PLATFORM_BACKEND_AMDGPU: + S << "AMDGPU"; + break; + case OL_PLATFORM_BACKEND_HOST: + S << "HOST"; + break; + default: + S << "<< INVALID >>"; + break; + } +} +template <> +void doWrite<ol_device_type_t>(std::ostream &S, ol_device_type_t &&Val) { + switch (Val) { + case OL_DEVICE_TYPE_GPU: + S << "GPU"; + break; + case OL_DEVICE_TYPE_CPU: + S << "CPU"; + break; + case OL_DEVICE_TYPE_HOST: + S << "HOST"; + break; + default: + S << "<< INVALID >>"; + break; + } +} +template <> +void doWrite<ol_dimensions_t>(std::ostream &S, ol_dimensions_t &&Val) { + S << "{x: " << Val.x << ", y: " << Val.y << ", z: " << Val.z << "}"; +} +template <> +void doWrite<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + std::ostream &S, ol_device_fp_capability_flags_t &&Val) { + S << Val << " {"; + + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT) { + S << " CORRECTLY_ROUNDED_DIVIDE_SQRT"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST) { + S << " ROUND_TO_NEAREST"; + } + if (Val & 
OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO) { + S << " ROUND_TO_ZERO"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF) { + S << " ROUND_TO_INF"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN) { + S << " INF_NAN"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_DENORM) { + S << " DENORM"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_FMA) { + S << " FMA"; + } + if (Val & OL_DEVICE_FP_CAPABILITY_FLAG_SOFT_FLOAT) { + S << " SOFT_FLOAT"; + } + + S << " }"; +} - printf("Found %d devices:\n", omp_get_num_devices()); - for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) { - printf(" Device %d:\n", Dev); - if (!__tgt_print_device_info(Dev)) - printf(" print_device_info not implemented\n"); - printf("\n"); +template <typename T> +ol_result_t printPlatformValue(std::ostream &S, ol_platform_handle_t Plat, + ol_platform_info_t Info, const char *Desc) { + S << Desc << ": "; + + if constexpr (std::is_pointer_v<T>) { + std::vector<uint8_t> Val; + size_t Size; + OFFLOAD_ERR(olGetPlatformInfoSize(Plat, Info, &Size)); + Val.resize(Size); + OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, sizeof(Val), Val.data())); + doWrite(S, reinterpret_cast<T>(Val.data())); + } else { + T Val; + OFFLOAD_ERR(olGetPlatformInfo(Plat, Info, sizeof(Val), &Val)); + doWrite(S, std::move(Val)); + } + S << "\n"; + return OL_SUCCESS; +} + +template <typename T, PrintKind PK = PrintKind::NORMAL> +ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev, + ol_device_info_t Info, const char *Desc, + const char *Units = nullptr) { + S << Desc << ": "; + + if constexpr (std::is_pointer_v<T>) { + std::vector<uint8_t> Val; + size_t Size; + OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size)); + Val.resize(Size); + OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data())); + doWrite<T, PK>(S, reinterpret_cast<T>(Val.data())); + } else { + T Val; + OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), &Val)); + doWrite<T, PK>(S, std::move(Val)); + } + if (Units) + S << " " << Units; + S << "\n"; + 
return OL_SUCCESS; +} + +ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { + ol_platform_handle_t Platform; + OFFLOAD_ERR( + olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform)); + + std::vector<char> Name; + size_t NameSize; + OFFLOAD_ERR(olGetDeviceInfoSize(D, OL_DEVICE_INFO_PRODUCT_NAME, &NameSize)) + Name.resize(NameSize); + OFFLOAD_ERR( + olGetDeviceInfo(D, OL_DEVICE_INFO_PRODUCT_NAME, NameSize, Name.data())); + S << "[" << Name.data() << "]\n"; + + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_NAME, "Platform Name")); + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_VENDOR_NAME, "Platform Vendor Name")); + OFFLOAD_ERR(printPlatformValue<const char *>( + S, Platform, OL_PLATFORM_INFO_VERSION, "Platform Version")); + OFFLOAD_ERR(printPlatformValue<ol_platform_backend_t>( + S, Platform, OL_PLATFORM_INFO_BACKEND, "Platform Backend")); + + OFFLOAD_ERR( + printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); + OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME, + "Product Name")); + OFFLOAD_ERR( + printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); + OFFLOAD_ERR(printDeviceValue<const char *>( + S, D, OL_DEVICE_INFO_DRIVER_VERSION, "Driver Version")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, "Max Work Group Size")); + OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, + "Max Work Group Size Per Dimension")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MAX_WORK_SIZE, + "Max Work Size")); + OFFLOAD_ERR(printDeviceValue<ol_dimensions_t>( + S, D, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, + "Max Work Size Per Dimension")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_VENDOR_ID, "Vendor ID")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NUM_COMPUTE_UNITS, + 
"Num Compute Units")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, "Max Clock Frequency", "MHz")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_MEMORY_CLOCK_RATE, + "Memory Clock Rate", "MHz")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_ADDRESS_BITS, + "Address Bits")); + OFFLOAD_ERR(printDeviceValue<uint64_t>( + S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B")); + OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, + "Global Mem Size", "B")); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG, + "Single Precision Floating Point Capability"))); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_DOUBLE_FP_CONFIG, + "Double Precision Floating Point Capability"))); + OFFLOAD_ERR( + (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>( + S, D, OL_DEVICE_INFO_HALF_FP_CONFIG, + "Half Precision Floating Point Capability"))); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, + "Native Vector Width For Char")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, + "Native Vector Width For Short")); + OFFLOAD_ERR(printDeviceValue<uint32_t>(S, D, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, + "Native Vector Width For Int")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, + "Native Vector Width For Long")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, + "Native Vector Width For Float")); + OFFLOAD_ERR(printDeviceValue<uint32_t>( + S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, + "Native Vector Width For Double")); + OFFLOAD_ERR( + printDeviceValue<uint32_t>(S, D, OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF, + "Native Vector Width 
For Half")); + + return OL_SUCCESS; +} + +ol_result_t printRoot(std::ostream &S) { + OFFLOAD_ERR(olInit()); + S << "Liboffload Version: " << OL_VERSION_MAJOR << "." << OL_VERSION_MINOR + << "." << OL_VERSION_PATCH << "\n"; + + std::vector<ol_device_handle_t> Devices; + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + reinterpret_cast<decltype(Devices) *>(UserData)->push_back(Device); + return true; + }, + &Devices)); + + S << "Num Devices: " << Devices.size() << "\n"; + + for (auto &D : Devices) { + S << "\n"; + OFFLOAD_ERR(printDevice(S, D)); } - __tgt_unregister_lib(&EmptyDesc); + OFFLOAD_ERR(olShutDown()); + return OL_SUCCESS; +} + +int main(int argc, char **argv) { + auto Err = printRoot(std::cout); + + if (Err) { + std::cerr << "[Liboffload error " << Err->Code << "]: " << Err->Details + << "\n"; + return 1; + } return 0; } diff --git a/offload/tools/offload-tblgen/APIGen.cpp b/offload/tools/offload-tblgen/APIGen.cpp index 8c61d1f12de7..1e79c00ae06c 100644 --- a/offload/tools/offload-tblgen/APIGen.cpp +++ b/offload/tools/offload-tblgen/APIGen.cpp @@ -131,7 +131,8 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) { OS << formatv("/// @brief {0}\n", Enum.getDesc()); OS << formatv("typedef enum {0} {{\n", Enum.getName()); - uint32_t EtorVal = 0; + // Bitfields start from 1, other enums from 0 + uint32_t EtorVal = Enum.isBitField(); for (const auto &EnumVal : Enum.getValues()) { if (Enum.isTyped()) { OS << MakeComment( @@ -141,7 +142,12 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) { OS << MakeComment(EnumVal.getDesc()); } OS << formatv(TAB_1 "{0}_{1} = {2},\n", Enum.getEnumValNamePrefix(), - EnumVal.getName(), EtorVal++); + EnumVal.getName(), EtorVal); + if (Enum.isBitField()) { + EtorVal <<= 1u; + } else { + ++EtorVal; + } } // Add last_element/force uint32 val @@ -220,31 +226,23 @@ OL_APIEXPORT ol_result_t OL_APICALL {0}WithCodeLoc( void EmitOffloadAPI(const RecordKeeper &Records, raw_ostream 
&OS) { OS << GenericHeader; OS << FileHeader; - // Generate main API definitions - for (auto *R : Records.getAllDerivedDefinitions("APIObject")) { - if (R->isSubClassOf("Macro")) { - ProcessMacro(MacroRec{R}, OS); - } else if (R->isSubClassOf("Typedef")) { - ProcessTypedef(TypedefRec{R}, OS); - } else if (R->isSubClassOf("Handle")) { - ProcessHandle(HandleRec{R}, OS); - } else if (R->isSubClassOf("Function")) { - ProcessFunction(FunctionRec{R}, OS); - } else if (R->isSubClassOf("Enum")) { - ProcessEnum(EnumRec{R}, OS); - } else if (R->isSubClassOf("Struct")) { - ProcessStruct(StructRec{R}, OS); - } else if (R->isSubClassOf("FptrTypedef")) { - ProcessFptrTypedef(FptrTypedefRec{R}, OS); - } - } - // Generate auxiliary definitions (func param structs etc) + // Generate main API definitions + for (auto *R : Records.getAllDerivedDefinitions("Macro")) + ProcessMacro(MacroRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Handle")) + ProcessHandle(HandleRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Enum")) + ProcessEnum(EnumRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Typedef")) + ProcessTypedef(TypedefRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("FptrTypedef")) + ProcessFptrTypedef(FptrTypedefRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Struct")) + ProcessStruct(StructRec{R}, OS); for (auto *R : Records.getAllDerivedDefinitions("Function")) { ProcessFuncParamStruct(FunctionRec{R}, OS); - } - - for (auto *R : Records.getAllDerivedDefinitions("Function")) { + ProcessFunction(FunctionRec{R}, OS); ProcessFuncWithCodeLocVariant(FunctionRec{R}, OS); } diff --git a/offload/tools/offload-tblgen/MiscGen.cpp b/offload/tools/offload-tblgen/MiscGen.cpp index b90e5cfdec8b..8a8b9caf2348 100644 --- a/offload/tools/offload-tblgen/MiscGen.cpp +++ b/offload/tools/offload-tblgen/MiscGen.cpp @@ -86,7 +86,7 @@ void EmitOffloadErrcodes(const RecordKeeper &Records, raw_ostream &OS) { )"; - auto ErrorCodeEnum 
= EnumRec{Records.getDef("ErrorCode")}; + auto ErrorCodeEnum = EnumRec{Records.getDef("ol_errc_t")}; uint32_t EtorVal = 0; for (const auto &EnumVal : ErrorCodeEnum.getValues()) { OS << formatv(TAB_1 "OFFLOAD_ERRC({0}, \"{1}\", {2})\n", EnumVal.getName(), @@ -107,10 +107,16 @@ void EmitOffloadInfo(const RecordKeeper &Records, raw_ostream &OS) { )"; - auto ErrorCodeEnum = EnumRec{Records.getDef("DeviceInfo")}; - uint32_t EtorVal = 0; - for (const auto &EnumVal : ErrorCodeEnum.getValues()) { + auto Enum = EnumRec{Records.getDef("ol_device_info_t")}; + // Bitfields start from 1, other enums from 0 + uint32_t EtorVal = Enum.isBitField(); + for (const auto &EnumVal : Enum.getValues()) { OS << formatv(TAB_1 "OFFLOAD_DEVINFO({0}, \"{1}\", {2})\n", - EnumVal.getName(), EnumVal.getDesc(), EtorVal++); + EnumVal.getName(), EnumVal.getDesc(), EtorVal); + if (Enum.isBitField()) { + EtorVal <<= 1u; + } else { + ++EtorVal; + } } } diff --git a/offload/tools/offload-tblgen/RecordTypes.hpp b/offload/tools/offload-tblgen/RecordTypes.hpp index 65c0a4ce4a2c..2abd9e10f0f9 100644 --- a/offload/tools/offload-tblgen/RecordTypes.hpp +++ b/offload/tools/offload-tblgen/RecordTypes.hpp @@ -16,25 +16,30 @@ namespace llvm { namespace offload { namespace tblgen { -class HandleRec { +class APIObject { public: - explicit HandleRec(const Record *rec) : rec(rec) {} - StringRef getName() const { return rec->getValueAsString("name"); } + StringRef getName() const { return rec->getName(); } StringRef getDesc() const { return rec->getValueAsString("desc"); } -private: +protected: + APIObject(const Record *rec) : rec(rec) {} const Record *rec; }; -class MacroRec { +class HandleRec : public APIObject { public: - explicit MacroRec(const Record *rec) : rec(rec) { - auto Name = rec->getValueAsString("name"); + explicit HandleRec(const Record *rec) : APIObject(rec) {}; +}; + +class MacroRec : public APIObject { +public: + explicit MacroRec(const Record *rec) : APIObject(rec) { + auto Name = rec->getName(); 
auto OpenBrace = Name.find_first_of("("); nameWithoutArgs = Name.substr(0, OpenBrace); } StringRef getName() const { return nameWithoutArgs; } - StringRef getNameWithArgs() const { return rec->getValueAsString("name"); } + StringRef getNameWithArgs() const { return rec->getName(); } StringRef getDesc() const { return rec->getValueAsString("desc"); } std::optional<StringRef> getCondition() const { @@ -46,19 +51,15 @@ public: } private: - const Record *rec; std::string nameWithoutArgs; }; -class TypedefRec { +class TypedefRec : public APIObject { public: - explicit TypedefRec(const Record *rec) : rec(rec) {} - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } - StringRef getValue() const { return rec->getValueAsString("value"); } + explicit TypedefRec(const Record *rec) : APIObject(rec) {}; -private: - const Record *rec; +public: + StringRef getValue() const { return rec->getValueAsString("value"); } }; class EnumValueRec { @@ -74,15 +75,13 @@ private: const Record *rec; }; -class EnumRec { +class EnumRec : public APIObject { public: - explicit EnumRec(const Record *rec) : rec(rec) { + explicit EnumRec(const Record *rec) : APIObject(rec) { for (const auto *Val : rec->getValueAsListOfDefs("etors")) { vals.emplace_back(EnumValueRec{Val}); } } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } const std::vector<EnumValueRec> &getValues() const { return vals; } std::string getEnumValNamePrefix() const { @@ -92,8 +91,9 @@ public: bool isTyped() const { return rec->getValueAsBit("is_typed"); } + bool isBitField() const { return rec->getValueAsBit("is_bit_field"); } + private: - const Record *rec; std::vector<EnumValueRec> vals; }; @@ -110,22 +110,19 @@ private: const Record *rec; }; -class StructRec { +class StructRec : public APIObject { public: - explicit StructRec(const Record *rec) : rec(rec) { 
+ explicit StructRec(const Record *rec) : APIObject(rec) { for (auto *Member : rec->getValueAsListOfDefs("all_members")) { members.emplace_back(StructMemberRec(Member)); } } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } std::optional<StringRef> getBaseClass() const { return rec->getValueAsOptionalString("base_class"); } const std::vector<StructMemberRec> &getMembers() const { return members; } private: - const Record *rec; std::vector<StructMemberRec> members; }; @@ -205,9 +202,9 @@ private: const Record *rec; }; -class FunctionRec { +class FunctionRec : public APIObject { public: - FunctionRec(const Record *rec) : rec(rec) { + FunctionRec(const Record *rec) : APIObject(rec) { for (auto &Ret : rec->getValueAsListOfDefs("all_returns")) rets.emplace_back(Ret); for (auto &Param : rec->getValueAsListOfDefs("params")) @@ -219,11 +216,9 @@ public: llvm::convertToSnakeFromCamelCase(getName())); } - StringRef getName() const { return rec->getValueAsString("name"); } StringRef getClass() const { return rec->getValueAsString("api_class"); } const std::vector<ReturnRec> &getReturns() const { return rets; } const std::vector<ParamRec> &getParams() const { return params; } - StringRef getDesc() const { return rec->getValueAsString("desc"); } std::vector<StringRef> getDetails() const { return rec->getValueAsListOfStrings("details"); } @@ -234,25 +229,19 @@ public: private: std::vector<ReturnRec> rets; std::vector<ParamRec> params; - - const Record *rec; }; -class FptrTypedefRec { +class FptrTypedefRec : public APIObject { public: - explicit FptrTypedefRec(const Record *rec) : rec(rec) { + explicit FptrTypedefRec(const Record *rec) : APIObject(rec) { for (auto &Param : rec->getValueAsListOfDefs("params")) params.emplace_back(Param); } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } StringRef 
getReturn() const { return rec->getValueAsString("return"); } const std::vector<ParamRec> &getParams() const { return params; } private: std::vector<ParamRec> params; - - const Record *rec; }; } // namespace tblgen diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md new file mode 100644 index 000000000000..0202242c99a0 --- /dev/null +++ b/offload/unittests/Conformance/README.md @@ -0,0 +1,83 @@ +# GPU Math Conformance Tests + +## Overview + +This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL. + +While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison. + +The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data. + +### Table of Contents + +- [Getting Started](#getting-started) +- [Running the Tests](#running-the-tests) +- [Adding New Tests](#adding-new-tests) + +## Getting Started + +This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets. + +### System Requirements + +Before you begin, ensure your system meets the following requirements: + +- A system with an AMD or NVIDIA GPU. +- The latest proprietary GPU drivers installed. +- The corresponding development SDK for your hardware: + - **AMD:** [ROCm SDK](https://rocm.docs.amd.com) + - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) + +### Building the Dependencies + +The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. 
Please follow the instructions in the **"Standard runtimes build"** section of that guide: + +- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html) + +> [!IMPORTANT] +> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results. + +## Running the Tests + +### Default Test + +To build and run the conformance test for a given function (e.g., `logf`) against the default C standard math library `llvm-libm` provider, use the following command. This will execute the test on all available and supported platforms. + +```bash +ninja -C build/runtimes/runtimes-bins offload.conformance.logf +``` + +### Testing Other Providers + +Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag. + +- **For `cuda-math` on an NVIDIA GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda + ``` + +- **For `hip-math` on an AMD GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu + ``` + +You can also run all available configurations for a test with: + +```bash +./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all +``` + +## Adding New Tests + +To add a conformance test for a new math function, follow these steps: + +1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`. + +2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. 
This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy. + + - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage. + - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space. + +3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable. diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp index a351e924b8f8..d80660b2e3c7 100644 --- a/offload/unittests/Conformance/device_code/CUDAMath.cpp +++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp @@ -26,6 +26,22 @@ using namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return __nv_powf(Base, __nv_roundf(Exponent)); +} + +static inline double sincosSin(double X) { + double SinX, CosX; + __nv_sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + __nv_sincos(X, &SinX, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float SinX, CosX; __nv_sincosf(X, &SinX, &CosX); @@ -44,6 +60,11 @@ static inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_acos>(NumElements, Out, X); +} + __gpu_kernel 
void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_acosf>(NumElements, Out, X); @@ -54,6 +75,11 @@ __gpu_kernel void acoshfKernel(const float *X, float *Out, runKernelBody<__nv_acoshf>(NumElements, Out, X); } +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_asin>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_asinf>(NumElements, Out, X); @@ -69,16 +95,31 @@ __gpu_kernel void atanfKernel(const float *X, float *Out, runKernelBody<__nv_atanf>(NumElements, Out, X); } +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_atan2f>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_atanhf>(NumElements, Out, X); } +__gpu_kernel void cbrtKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_cbrt>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_cbrtf>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_cos>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_cosf>(NumElements, Out, X); @@ -99,51 +140,127 @@ __gpu_kernel void erffKernel(const float *X, float *Out, runKernelBody<__nv_erff>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_expf>(NumElements, Out, X); } +__gpu_kernel void exp10Kernel(const double *X, 
double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp10>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_exp10f>(NumElements, Out, X); } +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp2>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_exp2f>(NumElements, Out, X); } +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_expm1>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_expm1f>(NumElements, Out, X); } +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_hypot>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_hypotf>(NumElements, Out, X, Y); +} + +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_logf>(NumElements, Out, X); } +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log10>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_log10f>(NumElements, Out, X); } +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log1p>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { 
runKernelBody<__nv_log1pf>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log2>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_log2f>(NumElements, Out, X); } +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_powf>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_sin>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_sinf>(NumElements, Out, X); } +__gpu_kernel void sincosSinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -164,6 +281,11 @@ __gpu_kernel void sinpifKernel(const float *X, float *Out, runKernelBody<__nv_sinpif>(NumElements, Out, X); } +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_tan>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_tanf>(NumElements, Out, X); diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp index 8476dcbeff0c..894652a8e1af 100644 --- 
a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp +++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp @@ -48,29 +48,49 @@ extern const inline uint32_t __oclc_ISA_version = 9000; extern "C" { +double __nv_acos(double); float __nv_acosf(float); float __nv_acoshf(float); +double __nv_asin(double); float __nv_asinf(float); float __nv_asinhf(float); float __nv_atanf(float); +float __nv_atan2f(float, float); float __nv_atanhf(float); +double __nv_cbrt(double); float __nv_cbrtf(float); +double __nv_cos(double); float __nv_cosf(float); float __nv_coshf(float); float __nv_cospif(float); float __nv_erff(float); +double __nv_exp(double); float __nv_expf(float); +double __nv_exp10(double); float __nv_exp10f(float); +double __nv_exp2(double); float __nv_exp2f(float); +double __nv_expm1(double); float __nv_expm1f(float); +double __nv_hypot(double, double); +float __nv_hypotf(float, float); +double __nv_log(double); float __nv_logf(float); +double __nv_log10(double); float __nv_log10f(float); +double __nv_log1p(double); float __nv_log1pf(float); +double __nv_log2(double); float __nv_log2f(float); +float __nv_powf(float, float); +float __nv_roundf(float); +double __nv_sin(double); float __nv_sinf(float); +void __nv_sincos(double, double *, double *); void __nv_sincosf(float, float *, float *); float __nv_sinhf(float); float __nv_sinpif(float); +double __nv_tan(double); float __nv_tanf(float); float __nv_tanhf(float); } // extern "C" @@ -81,31 +101,70 @@ float __nv_tanhf(float); extern "C" { +double __ocml_acos_f64(double); float __ocml_acos_f32(float); +float16 __ocml_acos_f16(float16); float __ocml_acosh_f32(float); +float16 __ocml_acosh_f16(float16); +double __ocml_asin_f64(double); float __ocml_asin_f32(float); +float16 __ocml_asin_f16(float16); float __ocml_asinh_f32(float); +float16 __ocml_asinh_f16(float16); float __ocml_atan_f32(float); +float16 __ocml_atan_f16(float16); +float __ocml_atan2_f32(float, float); float __ocml_atanh_f32(float); +float16 
__ocml_atanh_f16(float16); +double __ocml_cbrt_f64(double); float __ocml_cbrt_f32(float); +double __ocml_cos_f64(double); float __ocml_cos_f32(float); +float16 __ocml_cos_f16(float16); float __ocml_cosh_f32(float); +float16 __ocml_cosh_f16(float16); float __ocml_cospi_f32(float); float __ocml_erf_f32(float); +double __ocml_exp_f64(double); float __ocml_exp_f32(float); +float16 __ocml_exp_f16(float16); +double __ocml_exp10_f64(double); float __ocml_exp10_f32(float); +float16 __ocml_exp10_f16(float16); +double __ocml_exp2_f64(double); float __ocml_exp2_f32(float); +float16 __ocml_exp2_f16(float16); +double __ocml_expm1_f64(double); float __ocml_expm1_f32(float); +float16 __ocml_expm1_f16(float16); +double __ocml_hypot_f64(double, double); +float __ocml_hypot_f32(float, float); +double __ocml_log_f64(double); float __ocml_log_f32(float); +float16 __ocml_log_f16(float16); +double __ocml_log10_f64(double); float __ocml_log10_f32(float); +float16 __ocml_log10_f16(float16); +double __ocml_log1p_f64(double); float __ocml_log1p_f32(float); +double __ocml_log2_f64(double); float __ocml_log2_f32(float); +float16 __ocml_log2_f16(float16); +float __ocml_pow_f32(float, float); +float __ocml_round_f32(float); +double __ocml_sin_f64(double); float __ocml_sin_f32(float); +float16 __ocml_sin_f16(float16); +double __ocml_sincos_f64(double, double *); float __ocml_sincos_f32(float, float *); float __ocml_sinh_f32(float); +float16 __ocml_sinh_f16(float16); float __ocml_sinpi_f32(float); +double __ocml_tan_f64(double); float __ocml_tan_f32(float); +float16 __ocml_tan_f16(float16); float __ocml_tanh_f32(float); +float16 __ocml_tanh_f16(float16); } // extern "C" #endif // HIP_MATH_FOUND diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp index 36efe6b2696a..7cc0ad5d9142 100644 --- a/offload/unittests/Conformance/device_code/HIPMath.cpp +++ b/offload/unittests/Conformance/device_code/HIPMath.cpp @@ -26,6 +26,22 @@ using 
namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return __ocml_pow_f32(Base, __ocml_round_f32(Exponent)); +} + +static inline double sincosSin(double X) { + double CosX; + double SinX = __ocml_sincos_f64(X, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double CosX; + double SinX = __ocml_sincos_f64(X, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float CosX; float SinX = __ocml_sincos_f32(X, &CosX); @@ -44,51 +60,116 @@ static inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acos_f64>(NumElements, Out, X); +} + __gpu_kernel void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_acos_f32>(NumElements, Out, X); } +__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acos_f16>(NumElements, Out, X); +} + __gpu_kernel void acoshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_acosh_f32>(NumElements, Out, X); } +__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acosh_f16>(NumElements, Out, X); +} + +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asin_f64>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_asin_f32>(NumElements, Out, X); } +__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asin_f16>(NumElements, Out, X); +} + __gpu_kernel void asinhfKernel(const float *X, float *Out, size_t NumElements) noexcept { 
runKernelBody<__ocml_asinh_f32>(NumElements, Out, X); } +__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asinh_f16>(NumElements, Out, X); +} + __gpu_kernel void atanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_atan_f32>(NumElements, Out, X); } +__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_atan_f16>(NumElements, Out, X); +} + +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_atan2_f32>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_atanh_f32>(NumElements, Out, X); } +__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_atanh_f16>(NumElements, Out, X); +} + +__gpu_kernel void cbrtKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cbrt_f64>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cos_f64>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cos_f32>(NumElements, Out, X); } +__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cos_f16>(NumElements, Out, X); +} + __gpu_kernel void coshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cosh_f32>(NumElements, Out, X); } +__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + 
runKernelBody<__ocml_cosh_f16>(NumElements, Out, X); +} + __gpu_kernel void cospifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cospi_f32>(NumElements, Out, X); @@ -99,51 +180,167 @@ __gpu_kernel void erffKernel(const float *X, float *Out, runKernelBody<__ocml_erf_f32>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp_f64>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp_f32>(NumElements, Out, X); } +__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp_f16>(NumElements, Out, X); +} + +__gpu_kernel void exp10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp10_f64>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp10_f32>(NumElements, Out, X); } +__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp10_f16>(NumElements, Out, X); +} + +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp2_f64>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp2_f32>(NumElements, Out, X); } +__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp2_f16>(NumElements, Out, X); +} + +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_expm1_f64>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_expm1_f32>(NumElements, Out, X); 
} +__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_expm1_f16>(NumElements, Out, X); +} + +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_hypot_f64>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_hypot_f32>(NumElements, Out, X, Y); +} + +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log_f64>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log_f32>(NumElements, Out, X); } +__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log_f16>(NumElements, Out, X); +} + +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log10_f64>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log10_f32>(NumElements, Out, X); } +__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log10_f16>(NumElements, Out, X); +} + +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log1p_f64>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log1p_f32>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log2_f64>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { 
runKernelBody<__ocml_log2_f32>(NumElements, Out, X); } +__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log2_f16>(NumElements, Out, X); +} + +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_pow_f32>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_sin_f64>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_sin_f32>(NumElements, Out, X); } +__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_sin_f16>(NumElements, Out, X); +} + +__gpu_kernel void sincosSinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -159,20 +356,40 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out, runKernelBody<__ocml_sinh_f32>(NumElements, Out, X); } +__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_sinh_f16>(NumElements, Out, X); +} + __gpu_kernel void sinpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X); } +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + 
runKernelBody<__ocml_tan_f64>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_tan_f32>(NumElements, Out, X); } +__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_tan_f16>(NumElements, Out, X); +} + __gpu_kernel void tanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_tanh_f32>(NumElements, Out, X); } + +__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_tanh_f16>(NumElements, Out, X); +} } // extern "C" #endif // HIP_MATH_FOUND diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp index 8869d8701748..8673d809fd0a 100644 --- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp +++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp @@ -25,6 +25,22 @@ using namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return powf(Base, roundf(Exponent)); +} + +static inline double sincosSin(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float SinX, CosX; sincosf(X, &SinX, &CosX); @@ -43,111 +59,302 @@ static inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<acos>(NumElements, Out, X); +} + __gpu_kernel void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<acosf>(NumElements, Out, X); } +__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + 
runKernelBody<acosf16>(NumElements, Out, X); +} + __gpu_kernel void acoshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<acoshf>(NumElements, Out, X); } +__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<acoshf16>(NumElements, Out, X); +} + +__gpu_kernel void acospif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<acospif16>(NumElements, Out, X); +} + +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<asin>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<asinf>(NumElements, Out, X); } +__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<asinf16>(NumElements, Out, X); +} + __gpu_kernel void asinhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<asinhf>(NumElements, Out, X); } +__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<asinhf16>(NumElements, Out, X); +} + __gpu_kernel void atanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<atanf>(NumElements, Out, X); } +__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<atanf16>(NumElements, Out, X); +} + +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<atan2f>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<atanhf>(NumElements, Out, X); } +__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<atanhf16>(NumElements, Out, X); +} + +__gpu_kernel void cbrtKernel(const double *X, double 
*Out, + size_t NumElements) noexcept { + runKernelBody<cbrt>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cbrtf>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<cos>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cosf>(NumElements, Out, X); } +__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<cosf16>(NumElements, Out, X); +} + __gpu_kernel void coshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<coshf>(NumElements, Out, X); } +__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<coshf16>(NumElements, Out, X); +} + __gpu_kernel void cospifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cospif>(NumElements, Out, X); } +__gpu_kernel void cospif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<cospif16>(NumElements, Out, X); +} + __gpu_kernel void erffKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<erff>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<expf>(NumElements, Out, X); } +__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<expf16>(NumElements, Out, X); +} + +__gpu_kernel void exp10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp10>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { 
runKernelBody<exp10f>(NumElements, Out, X); } +__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<exp10f16>(NumElements, Out, X); +} + +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp2>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<exp2f>(NumElements, Out, X); } +__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<exp2f16>(NumElements, Out, X); +} + +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<expm1>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<expm1f>(NumElements, Out, X); } -__gpu_kernel void hypotf16Kernel(const float16 *X, float16 *Y, float16 *Out, +__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out, size_t NumElements) noexcept { + runKernelBody<expm1f16>(NumElements, Out, X); +} + +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<hypot>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<hypotf>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotf16Kernel(const float16 *X, const float16 *Y, + float16 *Out, size_t NumElements) noexcept { runKernelBody<hypotf16>(NumElements, Out, X, Y); } +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<logf>(NumElements, Out, X); } +__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + 
runKernelBody<logf16>(NumElements, Out, X); +} + +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log10>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log10f>(NumElements, Out, X); } +__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<log10f16>(NumElements, Out, X); +} + +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log1p>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log1pf>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log2>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log2f>(NumElements, Out, X); } +__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<log2f16>(NumElements, Out, X); +} + +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<powf>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sin>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sinf>(NumElements, Out, X); } +__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<sinf16>(NumElements, Out, X); +} + +__gpu_kernel void sincosSinKernel(const double *X, 
double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -163,23 +370,53 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out, runKernelBody<sinhf>(NumElements, Out, X); } +__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<sinhf16>(NumElements, Out, X); +} + __gpu_kernel void sinpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sinpif>(NumElements, Out, X); } +__gpu_kernel void sinpif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<sinpif16>(NumElements, Out, X); +} + +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<tan>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanf>(NumElements, Out, X); } +__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanf16>(NumElements, Out, X); +} + __gpu_kernel void tanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanhf>(NumElements, Out, X); } +__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanhf16>(NumElements, Out, X); +} + __gpu_kernel void tanpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanpif>(NumElements, Out, X); } + +__gpu_kernel void tanpif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanpif16>(NumElements, Out, X); +} } // extern "C" diff --git 
a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp index 6f7f7a9b665d..39c6838eecf7 100644 --- a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp +++ b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp @@ -8,8 +8,8 @@ /// /// \file /// This file contains the definition of the ExhaustiveGenerator class, a -/// concrete input generator that exhaustively creates inputs from a given -/// sequence of ranges. +/// concrete range-based generator that exhaustively creates inputs from a +/// given sequence of ranges. /// //===----------------------------------------------------------------------===// @@ -17,89 +17,62 @@ #define MATHTEST_EXHAUSTIVEGENERATOR_HPP #include "mathtest/IndexedRange.hpp" -#include "mathtest/InputGenerator.hpp" +#include "mathtest/RangeBasedGenerator.hpp" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/Support/Parallel.h" - -#include <algorithm> #include <array> #include <cassert> #include <cstddef> #include <cstdint> +#include <optional> #include <tuple> namespace mathtest { template <typename... InTypes> class [[nodiscard]] ExhaustiveGenerator final - : public InputGenerator<InTypes...> { - static constexpr std::size_t NumInputs = sizeof...(InTypes); - static_assert(NumInputs > 0, "The number of inputs must be at least 1"); + : public RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...> { + + friend class RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>; + + using Base = RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>; + using IndexArrayType = std::array<uint64_t, Base::NumInputs>; + + using Base::RangesTuple; + using Base::Size; public: explicit constexpr ExhaustiveGenerator( const IndexedRange<InTypes> &...Ranges) noexcept - : RangesTuple(Ranges...) { - bool Overflowed = getSizeWithOverflow(Ranges..., Size); + : Base(Ranges...) 
{ + const auto MaybeSize = getInputSpaceSize(Ranges...); + + assert(MaybeSize.has_value() && "The size is too large"); + Size = *MaybeSize; - assert(!Overflowed && "The input space size is too large"); - assert((Size > 0) && "The input space size must be at least 1"); + assert((Size > 0) && "The size must be at least 1"); IndexArrayType DimSizes = {}; std::size_t DimIndex = 0; ((DimSizes[DimIndex++] = Ranges.getSize()), ...); - Strides[NumInputs - 1] = 1; - if constexpr (NumInputs > 1) - for (int Index = static_cast<int>(NumInputs) - 2; Index >= 0; --Index) + Strides[Base::NumInputs - 1] = 1; + if constexpr (Base::NumInputs > 1) + for (int Index = static_cast<int>(Base::NumInputs) - 2; Index >= 0; + --Index) Strides[Index] = Strides[Index + 1] * DimSizes[Index + 1]; } - void reset() noexcept override { NextFlatIndex = 0; } - - [[nodiscard]] std::size_t - fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override { - const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...}; - const std::size_t BufferSize = BufferSizes[0]; - assert((BufferSize != 0) && "Buffer size cannot be zero"); - assert(std::all_of(BufferSizes.begin(), BufferSizes.end(), - [&](std::size_t Size) { return Size == BufferSize; }) && - "All input buffers must have the same size"); - - if (NextFlatIndex >= Size) - return 0; - - const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex); - const auto CurrentFlatIndex = NextFlatIndex; - NextFlatIndex += BatchSize; - - auto BufferPtrsTuple = std::make_tuple(Buffers.data()...); - - llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) { - writeInputs(CurrentFlatIndex, Offset, BufferPtrsTuple); - }); - - return static_cast<std::size_t>(BatchSize); - } - private: - using RangesTupleType = std::tuple<IndexedRange<InTypes>...>; - using IndexArrayType = std::array<uint64_t, NumInputs>; - - static bool getSizeWithOverflow(const IndexedRange<InTypes> &...Ranges, - uint64_t &Size) noexcept { - Size = 1; - bool 
Overflowed = false; - - auto Multiplier = [&](const uint64_t RangeSize) { - if (!Overflowed) - Overflowed = __builtin_mul_overflow(Size, RangeSize, &Size); - }; + [[nodiscard]] constexpr IndexArrayType + getNDIndex(uint64_t FlatIndex) const noexcept { + IndexArrayType NDIndex; - (Multiplier(Ranges.getSize()), ...); + for (std::size_t Index = 0; Index < Base::NumInputs; ++Index) { + NDIndex[Index] = FlatIndex / Strides[Index]; + FlatIndex -= NDIndex[Index] * Strides[Index]; + } - return Overflowed; + return NDIndex; } template <typename BufferPtrsTupleType> @@ -109,31 +82,37 @@ private: writeInputsImpl<0>(NDIndex, Offset, BufferPtrsTuple); } - constexpr IndexArrayType getNDIndex(uint64_t FlatIndex) const noexcept { - IndexArrayType NDIndex; - - for (std::size_t Index = 0; Index < NumInputs; ++Index) { - NDIndex[Index] = FlatIndex / Strides[Index]; - FlatIndex -= NDIndex[Index] * Strides[Index]; - } - - return NDIndex; - } - template <std::size_t Index, typename BufferPtrsTupleType> void writeInputsImpl(IndexArrayType NDIndex, uint64_t Offset, BufferPtrsTupleType BufferPtrsTuple) const noexcept { - if constexpr (Index < NumInputs) { + if constexpr (Index < Base::NumInputs) { const auto &Range = std::get<Index>(RangesTuple); std::get<Index>(BufferPtrsTuple)[Offset] = Range[NDIndex[Index]]; + writeInputsImpl<Index + 1>(NDIndex, Offset, BufferPtrsTuple); } } - uint64_t Size = 1; - RangesTupleType RangesTuple; + [[nodiscard]] static constexpr std::optional<uint64_t> + getInputSpaceSize(const IndexedRange<InTypes> &...Ranges) noexcept { + uint64_t InputSpaceSize = 1; + bool Overflowed = false; + + auto Multiplier = [&](const uint64_t RangeSize) { + if (!Overflowed) + Overflowed = + __builtin_mul_overflow(InputSpaceSize, RangeSize, &InputSpaceSize); + }; + + (Multiplier(Ranges.getSize()), ...); + + if (Overflowed) + return std::nullopt; + + return InputSpaceSize; + } + IndexArrayType Strides = {}; - uint64_t NextFlatIndex = 0; }; } // namespace mathtest diff --git 
a/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp new file mode 100644 index 000000000000..436cd05f0a3d --- /dev/null +++ b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the definition of the RandomGenerator class, a concrete +/// range-based generator that randomly creates inputs from a given sequence of +/// ranges. +/// +//===----------------------------------------------------------------------===// + +#ifndef MATHTEST_RANDOMGENERATOR_HPP +#define MATHTEST_RANDOMGENERATOR_HPP + +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/RangeBasedGenerator.hpp" + +#include <cstddef> +#include <cstdint> +#include <tuple> + +namespace mathtest { + +template <typename... 
InTypes> +class [[nodiscard]] RandomGenerator final + : public RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...> { + + friend class RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>; + + using Base = RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>; + + using Base::RangesTuple; + using Base::Size; + +public: + explicit constexpr RandomGenerator( + SeedTy BaseSeed, uint64_t Size, + const IndexedRange<InTypes> &...Ranges) noexcept + : Base(Size, Ranges...), BaseSeed(BaseSeed) {} + +private: + [[nodiscard]] static uint64_t getRandomIndex(RandomState &RNG, + uint64_t RangeSize) noexcept { + if (RangeSize == 0) + return 0; + + const uint64_t Threshold = (-RangeSize) % RangeSize; + + uint64_t RandomNumber; + do { + RandomNumber = RNG.next(); + } while (RandomNumber < Threshold); + + return RandomNumber % RangeSize; + } + + template <typename BufferPtrsTupleType> + void writeInputs(uint64_t CurrentFlatIndex, uint64_t Offset, + BufferPtrsTupleType BufferPtrsTuple) const noexcept { + + RandomState RNG(SeedTy{BaseSeed.Value ^ (CurrentFlatIndex + Offset)}); + writeInputsImpl<0>(RNG, Offset, BufferPtrsTuple); + } + + template <std::size_t Index, typename BufferPtrsTupleType> + void writeInputsImpl(RandomState &RNG, uint64_t Offset, + BufferPtrsTupleType BufferPtrsTuple) const noexcept { + if constexpr (Index < Base::NumInputs) { + const auto &Range = std::get<Index>(RangesTuple); + const auto RandomIndex = getRandomIndex(RNG, Range.getSize()); + std::get<Index>(BufferPtrsTuple)[Offset] = Range[RandomIndex]; + + writeInputsImpl<Index + 1>(RNG, Offset, BufferPtrsTuple); + } + } + + SeedTy BaseSeed; +}; +} // namespace mathtest + +#endif // MATHTEST_RANDOMGENERATOR_HPP diff --git a/offload/unittests/Conformance/include/mathtest/RandomState.hpp b/offload/unittests/Conformance/include/mathtest/RandomState.hpp new file mode 100644 index 000000000000..322d53175236 --- /dev/null +++ b/offload/unittests/Conformance/include/mathtest/RandomState.hpp @@ 
-0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RandomState class, a fast and
+/// lightweight pseudo-random number generator.
+///
+/// The implementation is based on the xorshift* generator, seeded using the
+/// SplitMix64 generator for robust initialization. For more details on the
+/// algorithm, see: https://en.wikipedia.org/wiki/Xorshift
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANDOMSTATE_HPP
+#define MATHTEST_RANDOMSTATE_HPP
+
+#include <cstdint>
+
+struct SeedTy { // Named wrapper around a raw 64-bit seed
+  uint64_t Value;
+};
+
+class [[nodiscard]] RandomState {
+  uint64_t State; // Current generator state; rewritten by every next() call
+
+  [[nodiscard]] static constexpr uint64_t splitMix64(uint64_t X) noexcept { // Scrambles the seed into the initial state
+    X += 0x9E3779B97F4A7C15ULL;
+    X = (X ^ (X >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    X = (X ^ (X >> 27)) * 0x94D049BB133111EBULL;
+    X = (X ^ (X >> 31));
+    return X ? X : 0x9E3779B97F4A7C15ULL; // Never yield 0: next() would map a zero state to 0 forever
+  }
+
+public:
+  explicit constexpr RandomState(SeedTy Seed) noexcept
+      : State(splitMix64(Seed.Value)) {} // Initial state is therefore non-zero
+
+  inline uint64_t next() noexcept { // Advances State and returns the next 64-bit value
+    uint64_t X = State;
+    X ^= X >> 12;
+    X ^= X << 25;
+    X ^= X >> 27;
+    State = X; // The un-multiplied value is what gets stored
+    return X * 0x2545F4914F6CDD1DULL; // Output is the new state times a fixed multiplier
+  }
+};
+
+#endif // MATHTEST_RANDOMSTATE_HPP
diff --git a/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
new file mode 100644
index 000000000000..5e1e1139aba9
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RangeBasedGenerator class, a base
+/// class for input generators that operate on a sequence of ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANGEBASEDGENERATOR_HPP
+#define MATHTEST_RANGEBASEDGENERATOR_HPP
+
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/InputGenerator.hpp"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Parallel.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace mathtest {
+
+template <typename Derived, typename... InTypes>
+class [[nodiscard]] RangeBasedGenerator : public InputGenerator<InTypes...> { // fill() forwards each element to Derived::writeInputs()
+public:
+  void reset() noexcept override { NextFlatIndex = 0; } // Restart generation from flat index 0
+
+  [[nodiscard]] std::size_t
+  fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override { // Returns how many inputs were written; 0 once Size is reached
+    const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...};
+    const std::size_t BufferSize = BufferSizes[0];
+    assert((BufferSize != 0) && "Buffer size cannot be zero");
+    assert(std::all_of(BufferSizes.begin(), BufferSizes.end(),
+                       [&](std::size_t Size) { return Size == BufferSize; }) &&
+           "All input buffers must have the same size");
+
+    if (NextFlatIndex >= Size) // Every one of the Size inputs was already handed out
+      return 0;
+
+    const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex); // The last batch may be shorter than the buffers
+    const auto CurrentFlatIndex = NextFlatIndex;
+    NextFlatIndex += BatchSize;
+
+    auto BufferPtrsTuple = std::make_tuple(Buffers.data()...); // One raw pointer per input buffer
+
+    llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) { // Called once per Offset in [0, BatchSize)
+      static_cast<Derived *>(this)->writeInputs(CurrentFlatIndex, Offset,
+                                                BufferPtrsTuple);
+    });
+
+    return static_cast<std::size_t>(BatchSize);
+  }
+
+protected:
+  using RangesTupleType = std::tuple<IndexedRange<InTypes>...>;
+
+  static constexpr std::size_t NumInputs = sizeof...(InTypes);
+  static_assert(NumInputs > 0, "The number of inputs must be at least 1");
+
+  explicit constexpr RangeBasedGenerator(
+      const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...) {} // Size keeps its default member initializer (0)
+
+  explicit constexpr RangeBasedGenerator(
+      uint64_t Size, const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...), Size(Size) {}
+
+  RangesTupleType RangesTuple; // One IndexedRange per input type
+  uint64_t Size = 0; // Total number of inputs fill() hands out before returning 0
+
+private:
+  uint64_t NextFlatIndex = 0; // Flat index at which the next batch starts
+};
+} // namespace mathtest
+
+#endif // MATHTEST_RANGEBASEDGENERATOR_HPP
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index a0068c3cb59c..6c3425f1e17c 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{};
 
 [[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) {
   std::size_t PropSize = 0;
-  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize));
+  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME,
+                               &PropSize));
 
   if (PropSize == 0)
     return "";
 
   std::string PropValue(PropSize, '\0');
-  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize,
+  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize,
                            PropValue.data()));
   PropValue.pop_back(); // Remove the null terminator
 
diff --git a/offload/unittests/Conformance/tests/AcosTest.cpp b/offload/unittests/Conformance/tests/AcosTest.cpp
new file mode 100644
index 000000000000..bc0d1d2b7280
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AcosTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'acos' function to select the double version
+constexpr auto acosd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(acos);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acosd> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "acos";
+  static constexpr llvm::StringRef KernelName = "acosKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acos function");
+
+  using namespace mathtest;
+
+  uint64_t Seed = 42; // Fixed seed for the random input stream
+  uint64_t Size = 1ULL << 32; // Number of random inputs to draw (2^32)
+  IndexedRange<double> Range(/*Begin=*/-1.0,
+                             /*End=*/1.0,
+                             /*Inclusive=*/true); // Both end points are part of the range
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed = runTests<acosd>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Acosf16Test.cpp b/offload/unittests/Conformance/tests/Acosf16Test.cpp
new file mode 100644
index 000000000000..ce11cc2aa1e8
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acosf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acosf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acosf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acosf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "acosf16";
+  static constexpr llvm::StringRef KernelName = "acosf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acosf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true); // Both end points are part of the range
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<acosf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/AcosfTest.cpp b/offload/unittests/Conformance/tests/AcosfTest.cpp
index e69ee3b7d1fd..65b2d18d7728 100644
--- a/offload/unittests/Conformance/tests/AcosfTest.cpp
+++ b/offload/unittests/Conformance/tests/AcosfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Acoshf16Test.cpp b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
new file mode 100644
index 000000000000..80434477aa43
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acoshf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acoshf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acoshf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "acoshf16";
+  static constexpr llvm::StringRef KernelName = "acoshf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acoshf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range(/*Begin=*/float16(1.0),
+                              /*End=*/getMaxOrInf<float16>(), // NOTE(review): presumably the largest float16 or +inf — confirm in Numerics.hpp
+                              /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<acoshf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Acospif16Test.cpp b/offload/unittests/Conformance/tests/Acospif16Test.cpp
new file mode 100644
index 000000000000..c5871e27cafc
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acospif16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acospif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acospif16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<acospif16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "acospif16";
+  static constexpr llvm::StringRef KernelName = "acospif16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acospif16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true); // Both end points are part of the range
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<acospif16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/AsinTest.cpp b/offload/unittests/Conformance/tests/AsinTest.cpp
new file mode 100644
index 000000000000..aaaa37af02bc
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AsinTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asin function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'asin' function to select the double version
+constexpr auto asind // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(asin);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asind> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "asin";
+  static constexpr llvm::StringRef KernelName = "asinKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asin function");
+
+  using namespace mathtest;
+
+  uint64_t Seed = 42; // Fixed seed for the random input stream
+  uint64_t Size = 1ULL << 32; // Number of random inputs to draw (2^32)
+  IndexedRange<double> Range(/*Begin=*/-1.0,
+                             /*End=*/1.0,
+                             /*Inclusive=*/true); // Both end points are part of the range
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed = runTests<asind>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Asinf16Test.cpp b/offload/unittests/Conformance/tests/Asinf16Test.cpp
new file mode 100644
index 000000000000..5784d6bfe08e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asinf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "asinf16";
+  static constexpr llvm::StringRef KernelName = "asinf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asinf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true); // Both end points are part of the range
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<asinf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/AsinfTest.cpp b/offload/unittests/Conformance/tests/AsinfTest.cpp
index 991f79b111ef..aeee648b5fa0 100644
--- a/offload/unittests/Conformance/tests/AsinfTest.cpp
+++ b/offload/unittests/Conformance/tests/AsinfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Asinhf16Test.cpp b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
new file mode 100644
index 000000000000..0af9bcb06fef
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinhf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<asinhf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "asinhf16";
+  static constexpr llvm::StringRef KernelName = "asinhf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the asinhf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range; // NOTE(review): default-constructed; presumably the whole float16 domain — confirm in IndexedRange.hpp
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<asinhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Atan2fTest.cpp b/offload/unittests/Conformance/tests/Atan2fTest.cpp
new file mode 100644
index 000000000000..4a46f9a61540
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atan2fTest.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atan2f function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace mathtest {
+
+template <> struct FunctionConfig<atan2f> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "atan2f";
+  static constexpr llvm::StringRef KernelName = "atan2fKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 6;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atan2f function");
+
+  using namespace mathtest;
+
+  uint64_t Seed = 42; // Fixed seed for the random input stream
+  uint64_t Size = 1ULL << 32; // Number of random input pairs to draw (2^32)
+  IndexedRange<float> RangeX; // NOTE(review): default-constructed; presumably the whole float domain — confirm in IndexedRange.hpp
+  IndexedRange<float> RangeY;
+  RandomGenerator<float, float> Generator(SeedTy{Seed}, Size, RangeX, RangeY); // One range per function argument
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<atan2f>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Atanf16Test.cpp b/offload/unittests/Conformance/tests/Atanf16Test.cpp
new file mode 100644
index 000000000000..3d3fa384e84d
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<atanf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "atanf16";
+  static constexpr llvm::StringRef KernelName = "atanf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atanf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range; // NOTE(review): default-constructed; presumably the whole float16 domain — confirm in IndexedRange.hpp
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<atanf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Atanhf16Test.cpp b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
new file mode 100644
index 000000000000..86a0f82ce376
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanhf16(float16); // Local declaration of the function under test
+
+namespace mathtest {
+
+template <> struct FunctionConfig<atanhf16> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "atanhf16";
+  static constexpr llvm::StringRef KernelName = "atanhf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the atanhf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range(/*Begin=*/float16(-1.0),
+                              /*End=*/float16(1.0),
+                              /*Inclusive=*/true); // Both end points are part of the range
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<atanhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/CMakeLists.txt b/offload/unittests/Conformance/tests/CMakeLists.txt
index 8c0109ba62ce..ad94df8e7978 100644
--- a/offload/unittests/Conformance/tests/CMakeLists.txt
+++ b/offload/unittests/Conformance/tests/CMakeLists.txt
@@ -3,30 +3,72 @@ if(NOT TARGET libc)
   return()
 endif()
 
+add_conformance_test(acos AcosTest.cpp)
 add_conformance_test(acosf AcosfTest.cpp)
+add_conformance_test(acosf16 Acosf16Test.cpp)
 add_conformance_test(acoshf AcoshfTest.cpp)
+add_conformance_test(acoshf16 Acoshf16Test.cpp)
+add_conformance_test(acospif16 Acospif16Test.cpp)
+add_conformance_test(asin AsinTest.cpp)
 add_conformance_test(asinf AsinfTest.cpp)
+add_conformance_test(asinf16 Asinf16Test.cpp)
 add_conformance_test(asinhf AsinhfTest.cpp)
+add_conformance_test(asinhf16 Asinhf16Test.cpp)
 add_conformance_test(atanf AtanfTest.cpp)
+add_conformance_test(atanf16 Atanf16Test.cpp)
+add_conformance_test(atan2f Atan2fTest.cpp)
 add_conformance_test(atanhf AtanhfTest.cpp)
+add_conformance_test(atanhf16 Atanhf16Test.cpp)
+add_conformance_test(cbrt CbrtTest.cpp)
 add_conformance_test(cbrtf CbrtfTest.cpp)
+add_conformance_test(cos CosTest.cpp)
 add_conformance_test(cosf CosfTest.cpp)
+add_conformance_test(cosf16 Cosf16Test.cpp)
 add_conformance_test(coshf CoshfTest.cpp)
+add_conformance_test(coshf16 Coshf16Test.cpp)
 add_conformance_test(cospif CospifTest.cpp)
+add_conformance_test(cospif16 Cospif16Test.cpp)
 add_conformance_test(erff ErffTest.cpp)
+add_conformance_test(exp ExpTest.cpp)
 add_conformance_test(expf ExpfTest.cpp)
+add_conformance_test(expf16 Expf16Test.cpp)
+add_conformance_test(exp10 Exp10Test.cpp)
 add_conformance_test(exp10f Exp10fTest.cpp)
+add_conformance_test(exp10f16 Exp10f16Test.cpp)
+add_conformance_test(exp2 Exp2Test.cpp)
 add_conformance_test(exp2f Exp2fTest.cpp)
+add_conformance_test(exp2f16 Exp2f16Test.cpp)
+add_conformance_test(expm1 Expm1Test.cpp)
 add_conformance_test(expm1f Expm1fTest.cpp)
+add_conformance_test(expm1f16 Expm1f16Test.cpp)
+add_conformance_test(hypot HypotTest.cpp)
+add_conformance_test(hypotf HypotfTest.cpp)
 add_conformance_test(hypotf16 Hypotf16Test.cpp)
+add_conformance_test(log LogTest.cpp)
 add_conformance_test(logf LogfTest.cpp)
+add_conformance_test(logf16 Logf16Test.cpp)
+add_conformance_test(log10 Log10Test.cpp)
 add_conformance_test(log10f Log10fTest.cpp)
+add_conformance_test(log10f16 Log10f16Test.cpp)
+add_conformance_test(log1p Log1pTest.cpp)
 add_conformance_test(log1pf Log1pfTest.cpp)
+add_conformance_test(log2 Log2Test.cpp)
 add_conformance_test(log2f Log2fTest.cpp)
+add_conformance_test(log2f16 Log2f16Test.cpp)
+add_conformance_test(powf PowfTest.cpp)
+add_conformance_test(sin SinTest.cpp)
 add_conformance_test(sinf SinfTest.cpp)
+add_conformance_test(sinf16 Sinf16Test.cpp)
+add_conformance_test(sincos SincosTest.cpp)
 add_conformance_test(sincosf SincosfTest.cpp)
 add_conformance_test(sinhf SinhfTest.cpp)
+add_conformance_test(sinhf16 Sinhf16Test.cpp)
 add_conformance_test(sinpif SinpifTest.cpp)
+add_conformance_test(sinpif16 Sinpif16Test.cpp)
+add_conformance_test(tan TanTest.cpp)
 add_conformance_test(tanf TanfTest.cpp)
+add_conformance_test(tanf16 Tanf16Test.cpp)
 add_conformance_test(tanhf TanhfTest.cpp)
+add_conformance_test(tanhf16 Tanhf16Test.cpp)
 add_conformance_test(tanpif TanpifTest.cpp)
+add_conformance_test(tanpif16 Tanpif16Test.cpp)
diff --git a/offload/unittests/Conformance/tests/CbrtTest.cpp b/offload/unittests/Conformance/tests/CbrtTest.cpp
new file mode 100644
index 000000000000..3a6523b66ad8
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CbrtTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cbrt function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'cbrt' function to select the double version
+constexpr auto cbrtd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(cbrt);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<cbrtd> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "cbrt";
+  static constexpr llvm::StringRef KernelName = "cbrtKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cbrt function");
+
+  using namespace mathtest;
+
+  uint64_t Seed = 42; // Fixed seed for the random input stream
+  uint64_t Size = 1ULL << 32; // Number of random inputs to draw (2^32)
+  IndexedRange<double> Range; // NOTE(review): default-constructed; presumably the whole double domain — confirm in IndexedRange.hpp
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed = runTests<cbrtd>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/CosTest.cpp b/offload/unittests/Conformance/tests/CosTest.cpp
new file mode 100644
index 000000000000..e3d3d3da8180
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CosTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// Disambiguate the overloaded 'cos' function to select the double version
+constexpr auto cosd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(cos);
+} // namespace
+
+namespace mathtest {
+
+template <> struct FunctionConfig<cosd> { // Per-function settings, keyed on the function pointer
+  static constexpr llvm::StringRef Name = "cos";
+  static constexpr llvm::StringRef KernelName = "cosKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cos function");
+
+  using namespace mathtest;
+
+  uint64_t Seed = 42; // Fixed seed for the random input stream
+  uint64_t Size = 1ULL << 32; // Number of random inputs to draw (2^32)
+  IndexedRange<double> Range; // NOTE(review): default-constructed; presumably the whole double domain — confirm in IndexedRange.hpp
+  RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed = runTests<cosd>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE; // Exit status mirrors the runTests result
+}
diff --git a/offload/unittests/Conformance/tests/Cosf16Test.cpp b/offload/unittests/Conformance/tests/Cosf16Test.cpp
new file mode 100644
index 000000000000..680e4b99c549
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Cosf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cosf16 function.
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 cosf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<cosf16> { + static constexpr llvm::StringRef Name = "cosf16"; + static constexpr llvm::StringRef KernelName = "cosf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the cosf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<cosf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Coshf16Test.cpp b/offload/unittests/Conformance/tests/Coshf16Test.cpp new file mode 100644 index 000000000000..1b378b5a9401 --- /dev/null +++ b/offload/unittests/Conformance/tests/Coshf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the coshf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 coshf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<coshf16> { + static constexpr llvm::StringRef Name = "coshf16"; + static constexpr llvm::StringRef KernelName = "coshf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the coshf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<coshf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Cospif16Test.cpp b/offload/unittests/Conformance/tests/Cospif16Test.cpp new file mode 100644 index 000000000000..84aa682b4884 --- /dev/null +++ b/offload/unittests/Conformance/tests/Cospif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the cospif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 cospif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<cospif16> { + static constexpr llvm::StringRef Name = "cospif16"; + static constexpr llvm::StringRef KernelName = "cospif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the cospif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<cospif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp10Test.cpp b/offload/unittests/Conformance/tests/Exp10Test.cpp new file mode 100644 index 000000000000..05af4780213b --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp10Test.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp10 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp10' function to select the double version +constexpr auto exp10d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp10); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<exp10d> { + static constexpr llvm::StringRef Name = "exp10"; + static constexpr llvm::StringRef KernelName = "exp10Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp10 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp10d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp10f16Test.cpp b/offload/unittests/Conformance/tests/Exp10f16Test.cpp new file mode 100644 index 000000000000..7d61ad0c6aef --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp10f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp10f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 exp10f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<exp10f16> { + static constexpr llvm::StringRef Name = "exp10f16"; + static constexpr llvm::StringRef KernelName = "exp10f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the exp10f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp10f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp2Test.cpp b/offload/unittests/Conformance/tests/Exp2Test.cpp new file mode 100644 index 000000000000..bb2fa10a0dfa --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp2Test.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp2 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp2' function to select the double version +constexpr auto exp2d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp2); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<exp2d> { + static constexpr llvm::StringRef Name = "exp2"; + static constexpr llvm::StringRef KernelName = "exp2Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp2 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<exp2d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp2f16Test.cpp b/offload/unittests/Conformance/tests/Exp2f16Test.cpp new file mode 100644 index 000000000000..9ea92564e738 --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp2f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp2f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 exp2f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<exp2f16> { + static constexpr llvm::StringRef Name = "exp2f16"; + static constexpr llvm::StringRef KernelName = "exp2f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp2f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp2f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/ExpTest.cpp b/offload/unittests/Conformance/tests/ExpTest.cpp new file mode 100644 index 000000000000..9aa52b17905e --- /dev/null +++ b/offload/unittests/Conformance/tests/ExpTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp' function to select the double version +constexpr auto expd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<expd> { + static constexpr llvm::StringRef Name = "exp"; + static constexpr llvm::StringRef KernelName = "expKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<expd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expf16Test.cpp b/offload/unittests/Conformance/tests/Expf16Test.cpp new file mode 100644 index 000000000000..8938815e26a8 --- /dev/null +++ b/offload/unittests/Conformance/tests/Expf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 expf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<expf16> { + static constexpr llvm::StringRef Name = "expf16"; + static constexpr llvm::StringRef KernelName = "expf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the expf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expm1Test.cpp b/offload/unittests/Conformance/tests/Expm1Test.cpp new file mode 100644 index 000000000000..a27944bf722f --- /dev/null +++ b/offload/unittests/Conformance/tests/Expm1Test.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expm1 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'expm1' function to select the double version +constexpr auto expm1d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(expm1); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<expm1d> { + static constexpr llvm::StringRef Name = "expm1"; + static constexpr llvm::StringRef KernelName = "expm1Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the expm1 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expm1d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expm1f16Test.cpp b/offload/unittests/Conformance/tests/Expm1f16Test.cpp new file mode 100644 index 000000000000..447196bb8ea3 --- /dev/null +++ b/offload/unittests/Conformance/tests/Expm1f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expm1f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 expm1f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<expm1f16> { + static constexpr llvm::StringRef Name = "expm1f16"; + static constexpr llvm::StringRef KernelName = "expm1f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the expm1f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expm1f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/HypotTest.cpp b/offload/unittests/Conformance/tests/HypotTest.cpp new file mode 100644 index 000000000000..0417ad901d5e --- /dev/null +++ b/offload/unittests/Conformance/tests/HypotTest.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the hypot function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'hypot' function to select the double version +constexpr auto hypotd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double, double)>(hypot); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<hypotd> { + static constexpr llvm::StringRef Name = "hypot"; + static constexpr llvm::StringRef KernelName = "hypotKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the hypot function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> RangeX; + IndexedRange<double> RangeY; + RandomGenerator<double, double> Generator(SeedTy{Seed}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<hypotd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/HypotfTest.cpp b/offload/unittests/Conformance/tests/HypotfTest.cpp new file mode 100644 index 000000000000..98a4e906920d --- /dev/null +++ b/offload/unittests/Conformance/tests/HypotfTest.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the hypotf function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace mathtest { + +template <> struct FunctionConfig<hypotf> { + static constexpr llvm::StringRef Name = "hypotf"; + static constexpr llvm::StringRef KernelName = "hypotfKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the hypotf function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<float> RangeX; + IndexedRange<float> RangeY; + RandomGenerator<float, float> Generator(SeedTy{Seed}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<hypotf>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log10Test.cpp b/offload/unittests/Conformance/tests/Log10Test.cpp new file mode 100644 index 000000000000..bf46f11e960b --- /dev/null +++ b/offload/unittests/Conformance/tests/Log10Test.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log10 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log10' function to select the double version +constexpr auto log10d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log10); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log10d> { + static constexpr llvm::StringRef Name = "log10"; + static constexpr llvm::StringRef KernelName = "log10Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log10 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log10d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log10f16Test.cpp b/offload/unittests/Conformance/tests/Log10f16Test.cpp new file mode 100644 index 000000000000..605e1ae49077 --- /dev/null +++ b/offload/unittests/Conformance/tests/Log10f16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log10f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 log10f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<log10f16> { + static constexpr llvm::StringRef Name = "log10f16"; + static constexpr llvm::StringRef KernelName = "log10f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the log10f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log10f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log1pTest.cpp b/offload/unittests/Conformance/tests/Log1pTest.cpp new file mode 100644 index 000000000000..023b67e770de --- /dev/null +++ b/offload/unittests/Conformance/tests/Log1pTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log1p function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log1p' function to select the double version +constexpr auto log1pd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log1p); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log1pd> { + static constexpr llvm::StringRef Name = "log1p"; + static constexpr llvm::StringRef KernelName = "log1pKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log1p function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/-1.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log1pd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log2Test.cpp b/offload/unittests/Conformance/tests/Log2Test.cpp new file mode 100644 index 000000000000..2ae7e5c23292 --- /dev/null +++ b/offload/unittests/Conformance/tests/Log2Test.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log2 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log2' function to select the double version +constexpr auto log2d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log2); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log2d> { + static constexpr llvm::StringRef Name = "log2"; + static constexpr llvm::StringRef KernelName = "log2Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log2 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<log2d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log2f16Test.cpp b/offload/unittests/Conformance/tests/Log2f16Test.cpp new file mode 100644 index 000000000000..5ce46960774a --- /dev/null +++ b/offload/unittests/Conformance/tests/Log2f16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log2f16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 log2f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<log2f16> { + static constexpr llvm::StringRef Name = "log2f16"; + static constexpr llvm::StringRef KernelName = "log2f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log2f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log2f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/LogTest.cpp b/offload/unittests/Conformance/tests/LogTest.cpp new file mode 100644 index 000000000000..ae568e2c4740 --- /dev/null +++ b/offload/unittests/Conformance/tests/LogTest.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log' function to select the double version +constexpr auto logd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<logd> { + static constexpr llvm::StringRef Name = "log"; + static constexpr llvm::StringRef KernelName = "logKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<logd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Logf16Test.cpp b/offload/unittests/Conformance/tests/Logf16Test.cpp new file mode 100644 index 000000000000..372dccb2ebb9 --- /dev/null +++ b/offload/unittests/Conformance/tests/Logf16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the logf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 logf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<logf16> { + static constexpr llvm::StringRef Name = "logf16"; + static constexpr llvm::StringRef KernelName = "logf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the logf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<logf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/PowfTest.cpp b/offload/unittests/Conformance/tests/PowfTest.cpp new file mode 100644 index 000000000000..246801e390ae --- /dev/null +++ b/offload/unittests/Conformance/tests/PowfTest.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the powf function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +static inline float powfRoundedExponent(float Base, float Exponent) { + return powf(Base, roundf(Exponent)); +} + +namespace mathtest { + +template <> struct FunctionConfig<powf> { + static constexpr llvm::StringRef Name = "powf (real exponents)"; + static constexpr llvm::StringRef KernelName = "powfKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 16; +}; + +template <> struct FunctionConfig<powfRoundedExponent> { + static constexpr llvm::StringRef Name = "powf (integer exponents)"; + static constexpr llvm::StringRef KernelName = "powfRoundedExponentKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 16; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the powf function"); + + using namespace mathtest; + + uint64_t Size = 1ULL << 32; + IndexedRange<float> RangeX; + IndexedRange<float> RangeY; + RandomGenerator<float, float> Generator0(SeedTy{42}, Size, RangeX, RangeY); + RandomGenerator<float, float> Generator1(SeedTy{51}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool RealExponentsPassed = + runTests<powf>(Generator0, Configs, DeviceBinaryDir, IsVerbose); + bool IntegerExponentsPassed = runTests<powfRoundedExponent>( + Generator1, Configs, DeviceBinaryDir, IsVerbose); + + return (RealExponentsPassed && IntegerExponentsPassed) ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/SinTest.cpp b/offload/unittests/Conformance/tests/SinTest.cpp new file mode 100644 index 000000000000..36897d74c96a --- /dev/null +++ b/offload/unittests/Conformance/tests/SinTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sin function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'sin' function to select the double version +constexpr auto sind // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(sin); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<sind> { + static constexpr llvm::StringRef Name = "sin"; + static constexpr llvm::StringRef KernelName = "sinKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sin function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<sind>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/SincosTest.cpp b/offload/unittests/Conformance/tests/SincosTest.cpp new file mode 100644 index 000000000000..a3d1650c54e4 --- /dev/null +++ b/offload/unittests/Conformance/tests/SincosTest.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sincos function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +static inline double sincosSin(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return CosX; +} + +namespace mathtest { + +template <> struct FunctionConfig<sincosSin> { + static constexpr llvm::StringRef Name = "sincos (sin part)"; + static constexpr llvm::StringRef KernelName = "sincosSinKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 4; +}; + +template <> struct FunctionConfig<sincosCos> { + static constexpr llvm::StringRef Name = "sincos (cos part)"; + static constexpr llvm::StringRef KernelName = "sincosCosKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sincos function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool SinPartPassed = + runTests<sincosSin>(Generator, Configs, DeviceBinaryDir, IsVerbose); + bool CosPartPassed = + runTests<sincosCos>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return (SinPartPassed && CosPartPassed) ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinf16Test.cpp b/offload/unittests/Conformance/tests/Sinf16Test.cpp new file mode 100644 index 000000000000..4c5fb2226288 --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinf16> { + static constexpr llvm::StringRef Name = "sinf16"; + static constexpr llvm::StringRef KernelName = "sinf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sinf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinhf16Test.cpp b/offload/unittests/Conformance/tests/Sinhf16Test.cpp new file mode 100644 index 000000000000..fe6f7dd4a49c --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinhf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinhf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinhf16> { + static constexpr llvm::StringRef Name = "sinhf16"; + static constexpr llvm::StringRef KernelName = "sinhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sinhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinpif16Test.cpp b/offload/unittests/Conformance/tests/Sinpif16Test.cpp new file mode 100644 index 000000000000..ff9c93c0bb5b --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinpif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinpif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinpif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinpif16> { + static constexpr llvm::StringRef Name = "sinpif16"; + static constexpr llvm::StringRef KernelName = "sinpif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the sinpif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinpif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/TanTest.cpp b/offload/unittests/Conformance/tests/TanTest.cpp new file mode 100644 index 000000000000..3a9a05874450 --- /dev/null +++ b/offload/unittests/Conformance/tests/TanTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tan function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'tan' function to select the double version +constexpr auto tand // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(tan); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<tand> { + static constexpr llvm::StringRef Name = "tan"; + static constexpr llvm::StringRef KernelName = "tanKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 5; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tan function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<tand>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanf16Test.cpp b/offload/unittests/Conformance/tests/Tanf16Test.cpp new file mode 100644 index 000000000000..eae9818830a2 --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanf16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanf16> { + static constexpr llvm::StringRef Name = "tanf16"; + static constexpr llvm::StringRef KernelName = "tanf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + // Note: The minimum accuracy at the source is 2.5 ULP, but we round it + // down to ensure conformance. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tanf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanhf16Test.cpp b/offload/unittests/Conformance/tests/Tanhf16Test.cpp new file mode 100644 index 000000000000..1a11f3da7f09 --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanhf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanhf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanhf16> { + static constexpr llvm::StringRef Name = "tanhf16"; + static constexpr llvm::StringRef KernelName = "tanhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tanhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanpif16Test.cpp b/offload/unittests/Conformance/tests/Tanpif16Test.cpp new file mode 100644 index 000000000000..76374807b92f --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanpif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanpif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanpif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanpif16> { + static constexpr llvm::StringRef Name = "tanpif16"; + static constexpr llvm::StringRef KernelName = "tanpif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the tanpif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanpif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 8f0267eb39bd..50c99a5d5b63 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -20,12 +20,16 @@ add_offload_unittest("init" target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER) add_offload_unittest("kernel" + kernel/olCalculateOptimalOccupancy.cpp kernel/olLaunchKernel.cpp) add_offload_unittest("memory" memory/olMemAlloc.cpp + memory/olMemFill.cpp memory/olMemFree.cpp - memory/olMemcpy.cpp) + memory/olMemcpy.cpp + memory/olGetMemInfo.cpp + memory/olGetMemInfoSize.cpp) add_offload_unittest("platform" platform/olGetPlatformInfo.cpp @@ -33,6 +37,7 @@ add_offload_unittest("platform" add_offload_unittest("program" program/olCreateProgram.cpp + program/olIsValidBinary.cpp program/olDestroyProgram.cpp) add_offload_unittest("queue" @@ -41,7 +46,8 @@ add_offload_unittest("queue" queue/olDestroyQueue.cpp queue/olGetQueueInfo.cpp queue/olGetQueueInfoSize.cpp - queue/olWaitEvents.cpp) + queue/olWaitEvents.cpp + queue/olLaunchHostFunction.cpp) add_offload_unittest("symbol" symbol/olGetSymbol.cpp diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp index ef092cd4187d..c9da6ef9be7c 100644 --- a/offload/unittests/OffloadAPI/common/Environment.cpp +++ b/offload/unittests/OffloadAPI/common/Environment.cpp @@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out, raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) { size_t Size; - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size); + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size); std::vector<char> Name(Size); - olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data()); + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()); Out << Name.data(); return Out; } @@ -129,6 +129,9 @@ const 
std::vector<TestEnvironment::Device> &TestEnvironment::getDevices() { } } + if (Devices.size() == 0) + errs() << "Warning: No devices found for OffloadAPI tests.\n"; + return Devices; } diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp index 43240fa3c4a0..6f9961e2c6d5 100644 --- a/offload/unittests/OffloadAPI/common/Fixtures.hpp +++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp @@ -26,12 +26,30 @@ } while (0) #endif -// TODO: rework this so the EXPECTED/ACTUAL results are readable +#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED +#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL) \ + do { \ + ol_result_t Res = ACTUAL; \ + if (Res && Res->Code == OL_ERRC_UNSUPPORTED) { \ + GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test"; \ + return; \ + } else if (Res && Res->Code != OL_ERRC_SUCCESS) { \ + GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": " \ + << Res->Details; \ + } \ + } while (0) +#endif + #ifndef ASSERT_ERROR #define ASSERT_ERROR(EXPECTED, ACTUAL) \ do { \ ol_result_t Res = ACTUAL; \ - ASSERT_TRUE(Res && (Res->Code == EXPECTED)); \ + if (!Res) \ + GTEST_FAIL() << #ACTUAL " succeeded when we expected it to fail"; \ + if (Res->Code != EXPECTED) \ + GTEST_FAIL() << #ACTUAL " was expected to return " \ + << #EXPECTED " but instead returned " << Res->Code << ": " \ + << Res->Details; \ } while (0) #endif @@ -75,6 +93,40 @@ template <typename Fn> inline void threadify(Fn body) { } } +/// Enqueues a task to the queue that can be manually resolved. +// It will block until `trigger` is called. 
+struct ManuallyTriggeredTask {
+  // M guards Flag; CV is signalled once Flag has been set by trigger().
+  std::mutex M;
+  std::condition_variable CV;
+  bool Flag = false;
+  // Event created right after the blocking host function is enqueued.
+  ol_event_handle_t CompleteEvent;
+
+  // Enqueues a host function on Queue that calls wait() on this object, then
+  // creates CompleteEvent on the same queue. Returns the first error hit.
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  // Blocks until Flag is set or one second has elapsed, and records a test
+  // failure if the flag was never set.
+  void wait() {
+    std::unique_lock<std::mutex> lk(M);
+    CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  // Releases the task blocked in wait() and then synchronises on
+  // CompleteEvent.
+  ol_result_t trigger() {
+    {
+      // Flag is read by wait() under M on another thread, so it must be
+      // written under M as well. An unlocked write is a data race and can
+      // slip between the waiter's predicate check and its sleep, losing the
+      // wakeup until the timeout fires.
+      std::lock_guard<std::mutex> Lock(M);
+      Flag = true;
+    }
+    // Notify after unlocking so the woken thread can take M immediately.
+    CV.notify_one();
+
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
@@ -202,9 +254,13 @@ struct OffloadEventTest : OffloadQueueTest {
   ol_event_handle_t Event = nullptr;
 };
 
+// Devices might not be available for offload testing, so allow uninstantiated
+// tests (as the device list will be empty). This means that all tests requiring
+// a device will be silently skipped.
#define OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(FIXTURE) \ INSTANTIATE_TEST_SUITE_P( \ , FIXTURE, ::testing::ValuesIn(TestEnvironment::getDevices()), \ [](const ::testing::TestParamInfo<TestEnvironment::Device> &info) { \ return SanitizeString(info.param.Name); \ - }) + }); \ + GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FIXTURE) diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 5657320a33a2..8cb0b8065c33 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -13,6 +13,38 @@ using olGetDeviceInfoTest = OffloadDeviceTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoTest); +#define OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Dev, \ + Expr) \ + TEST_P(olGetDeviceInfoTest, Test##Dev##TestName) { \ + PropType Value; \ + ASSERT_SUCCESS(olGetDeviceInfo(Dev, PropName, sizeof(Value), &Value)); \ + Expr; \ + } + +#define OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, {}) + +#define OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, {}) + +#define OL_DEVICE_INFO_TEST_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName) + +#define OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, \ + LowBound) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, \ + ASSERT_GT(Value, LowBound)) + +#define OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, \ + LowBound) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, \ + ASSERT_GT(Value, LowBound)) + +#define OL_DEVICE_INFO_TEST_VALUE_GT(TestName, PropType, PropName, LowBound) \ + 
OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, LowBound) \ + OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, LowBound) + TEST_P(olGetDeviceInfoTest, SuccessType) { ol_device_type_t DeviceType; ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_TYPE, @@ -54,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessProductName) { + size_t Size = 0; + ASSERT_SUCCESS( + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + +TEST_P(olGetDeviceInfoTest, HostProductName) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); @@ -77,12 +132,8 @@ TEST_P(olGetDeviceInfoTest, SuccessDriverVersion) { ASSERT_EQ(std::strlen(DriverVersion.data()), Size - 1); } -TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSize) { - uint32_t Value; - ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(Value), &Value)); - ASSERT_GT(Value, 0u); -} +OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkGroupSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, 0); TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) { ol_dimensions_t Value{0, 0, 0}; @@ -94,6 +145,59 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) { ASSERT_GT(Value.z, 0u); } +OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t, + 
OL_DEVICE_INFO_MAX_WORK_SIZE, 0); + +TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) { + ol_dimensions_t Value{0, 0, 0}; + ASSERT_SUCCESS(olGetDeviceInfo(Device, + OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, + sizeof(Value), &Value)); + ASSERT_GT(Value.x, 0u); + ASSERT_GT(Value.y, 0u); + ASSERT_GT(Value.z, 0u); +} + +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t, + OL_DEVICE_INFO_VENDOR_ID, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); +OL_DEVICE_INFO_TEST_VALUE_GT(NumComputeUnits, uint32_t, + OL_DEVICE_INFO_NUM_COMPUTE_UNITS, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(SingleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_SINGLE_FP_CONFIG, 0); +OL_DEVICE_INFO_TEST_SUCCESS(HalfFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_HALF_FP_CONFIG); +OL_DEVICE_INFO_TEST_VALUE_GT(DoubleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_DOUBLE_FP_CONFIG, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthChar, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthShort, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthInt, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthLong, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthFloat, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthDouble, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, 0); +OL_DEVICE_INFO_TEST_SUCCESS(NativeVectorWidthHalf, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF); +OL_DEVICE_INFO_TEST_VALUE_GT(MaxClockFrequency, uint32_t, + OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(MemoryClockRate, uint32_t, + OL_DEVICE_INFO_MEMORY_CLOCK_RATE, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS, + 
0); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE); + TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) { ol_device_type_t DeviceType; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index 4e29978fc20f..c4a3c2d5e3c7 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -13,48 +13,76 @@ using olGetDeviceInfoSizeTest = OffloadDeviceTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoSizeTest); -TEST_P(olGetDeviceInfoSizeTest, SuccessType) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_TYPE, &Size)); - ASSERT_EQ(Size, sizeof(ol_device_type_t)); -} +#define OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, Expr) \ + TEST_P(olGetDeviceInfoSizeTest, Success##TestName) { \ + size_t Size = 0; \ + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, PropName, &Size)); \ + Expr; \ + } -TEST_P(olGetDeviceInfoSizeTest, SuccessPlatform) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PLATFORM, &Size)); - ASSERT_EQ(Size, sizeof(ol_platform_handle_t)); -} +#define OL_DEVICE_INFO_SIZE_TEST_EQ(TestName, PropType, PropName) \ + OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, \ + ASSERT_EQ(Size, sizeof(PropType))); -TEST_P(olGetDeviceInfoSizeTest, SuccessName) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size)); - ASSERT_NE(Size, 0ul); -} - -TEST_P(olGetDeviceInfoSizeTest, SuccessVendor) { - size_t Size = 0; - 
ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); - ASSERT_NE(Size, 0ul); -} +#define OL_DEVICE_INFO_SIZE_TEST_NONZERO(TestName, PropName) \ + OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, ASSERT_NE(Size, 0ul)); -TEST_P(olGetDeviceInfoSizeTest, SuccessDriverVersion) { - size_t Size = 0; - ASSERT_SUCCESS( - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_DRIVER_VERSION, &Size)); - ASSERT_NE(Size, 0ul); -} +OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE); +OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, + OL_DEVICE_INFO_PLATFORM); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); +OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t, + OL_DEVICE_INFO_NUM_COMPUTE_UNITS); +OL_DEVICE_INFO_SIZE_TEST_EQ(SingleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_SINGLE_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(HalfFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_HALF_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(DoubleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_DOUBLE_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthChar, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthShort, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthInt, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthLong, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG); 
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthFloat, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthDouble, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthHalf, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxClockFrequency, uint32_t, + OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY); +OL_DEVICE_INFO_SIZE_TEST_EQ(MemoryClockRate, uint32_t, + OL_DEVICE_INFO_MEMORY_CLOCK_RATE); +OL_DEVICE_INFO_SIZE_TEST_EQ(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE); -TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSize) { +TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { size_t Size = 0; - ASSERT_SUCCESS( - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, &Size)); - ASSERT_EQ(Size, sizeof(uint32_t)); + ASSERT_SUCCESS(olGetDeviceInfoSize( + Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size)); + ASSERT_EQ(Size, sizeof(ol_dimensions_t)); + ASSERT_EQ(Size, sizeof(uint32_t) * 3); } -TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { +TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize( - Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size)); + Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &Size)); ASSERT_EQ(Size, sizeof(ol_dimensions_t)); ASSERT_EQ(Size, sizeof(uint32_t) * 3); } diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt index 50e430597e64..1a042e1b3831 100644 --- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt @@ -2,6 +2,7 @@ 
add_offload_test_device_code(foo.cpp foo) add_offload_test_device_code(bar.cpp bar) # Compile with optimizations to eliminate AMDGPU implicit arguments. add_offload_test_device_code(noargs.cpp noargs -O3) +add_offload_test_device_code(byte.cpp byte) add_offload_test_device_code(localmem.cpp localmem) add_offload_test_device_code(localmem_reduction.cpp localmem_reduction) add_offload_test_device_code(localmem_static.cpp localmem_static) @@ -14,6 +15,7 @@ add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin + byte.bin localmem.bin localmem_reduction.bin localmem_static.bin diff --git a/offload/unittests/OffloadAPI/device_code/byte.cpp b/offload/unittests/OffloadAPI/device_code/byte.cpp new file mode 100644 index 000000000000..779d120fefca --- /dev/null +++ b/offload/unittests/OffloadAPI/device_code/byte.cpp @@ -0,0 +1,3 @@ +#include <gpuintrin.h> + +extern "C" __gpu_kernel void byte(unsigned char c) { (void)c; } diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp index 908d2dcb6df5..b86d15f045eb 100644 --- a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp +++ b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp @@ -13,13 +13,22 @@ using olGetEventInfoTest = OffloadEventTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventInfoTest); -TEST_P(olGetEventInfoTest, SuccessDevice) { +TEST_P(olGetEventInfoTest, SuccessQueue) { ol_queue_handle_t RetrievedQueue; ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_QUEUE, sizeof(ol_queue_handle_t), &RetrievedQueue)); ASSERT_EQ(Queue, RetrievedQueue); } +TEST_P(olGetEventInfoTest, SuccessIsComplete) { + bool Complete = false; + while (!Complete) { + ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_IS_COMPLETE, + sizeof(Complete), &Complete)); + } + ASSERT_EQ(Complete, true); +} + TEST_P(olGetEventInfoTest, InvalidNullHandle) { ol_queue_handle_t RetrievedQueue; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git 
a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp index d7dee58e35e8..36f36c3a187f 100644 --- a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp +++ b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp @@ -19,6 +19,12 @@ TEST_P(olGetEventInfoSizeTest, SuccessQueue) { ASSERT_EQ(Size, sizeof(ol_queue_handle_t)); } +TEST_P(olGetEventInfoSizeTest, SuccessIsComplete) { + size_t Size = 0; + ASSERT_SUCCESS(olGetEventInfoSize(Event, OL_EVENT_INFO_IS_COMPLETE, &Size)); + ASSERT_EQ(Size, sizeof(bool)); +} + TEST_P(olGetEventInfoSizeTest, InvalidNullHandle) { size_t Size = 0; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp new file mode 100644 index 000000000000..17fa383cac3f --- /dev/null +++ b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp @@ -0,0 +1,45 @@ +//===------- Offload API tests - olCalculateOptimalOccupancy --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +using olCalculateOptimalOccupancyTest = OffloadKernelTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest); + +TEST_P(olCalculateOptimalOccupancyTest, Success) { + size_t Size{0}; + ASSERT_SUCCESS_OR_UNSUPPORTED( + olCalculateOptimalOccupancy(Device, Kernel, 0, &Size)); + ASSERT_GT(Size, 0u); +} + +TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) { + size_t Size{0}; + ASSERT_SUCCESS_OR_UNSUPPORTED( + olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size)); + ASSERT_GT(Size, 0u); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullKernel) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olCalculateOptimalOccupancy(Device, nullptr, 0, &Size)); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullDevice) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size)); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullOutput) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr)); +} diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp index 1dac8c50271b..c9eca36a4d44 100644 --- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp +++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp @@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase { KERNEL_TEST(Foo, foo) KERNEL_TEST(NoArgs, noargs) +KERNEL_TEST(Byte, byte) KERNEL_TEST(LocalMem, localmem) KERNEL_TEST(LocalMemReduction, localmem_reduction) KERNEL_TEST(LocalMemStatic, localmem_static) diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp new file mode 100644 index 
000000000000..a4b382ff298a
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp
@@ -0,0 +1,130 @@
+//===------- Offload API tests - olGetMemInfo -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+// Size in bytes of the allocation created by every fixture in this file.
+constexpr size_t SIZE = 1024;
+
+// State shared by all olGetMemInfo fixtures: the allocation under test.
+struct olGetMemInfoBaseTest : OffloadDeviceTest {
+  // Address 123 bytes past Ptr; with SIZE == 1024 this is an interior pointer
+  // of the allocation. Only the address is computed, nothing is dereferenced.
+  void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; }
+
+  // Null until SetUp has allocated, so TearDown can tell whether there is
+  // anything to free.
+  void *Ptr = nullptr;
+};
+
+// Fixture that allocates SIZE bytes of the given allocation type per test.
+template <ol_alloc_type_t AllocType>
+struct olGetMemInfoTest : olGetMemInfoBaseTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olMemAlloc(Device, AllocType, SIZE, &Ptr));
+  }
+
+  void TearDown() override {
+    // SetUp returns before allocating when the base SetUp fails, and the
+    // allocation itself may fail; only free a pointer that was really set.
+    if (Ptr) {
+      ASSERT_SUCCESS(olMemFree(Ptr));
+    }
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown());
+  }
+};
+using olGetMemInfoDeviceTest = olGetMemInfoTest<OL_ALLOC_TYPE_DEVICE>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoDeviceTest);
+using olGetMemInfoManagedTest = olGetMemInfoTest<OL_ALLOC_TYPE_MANAGED>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoManagedTest);
+using olGetMemInfoHostTest = olGetMemInfoTest<OL_ALLOC_TYPE_HOST>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoHostTest);
+
+// Instantiates FUNCTION(Fixture, QueryPtr, ExpectedType) for each allocation
+// type, once with the base pointer (test name FUNCTION) and once with an
+// interior pointer (test name FUNCTION##Offset).
+// NOTE(review): the Offset variants assume olGetMemInfo resolves interior
+// pointers to their allocation - confirm against the implementation.
+#define PER_ALLOC_TEST(FUNCTION)                                               \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION) {                                   \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_DEVICE);                                 \
+  }                                                                            \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION) {                                  \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_MANAGED);                                \
+  }                                                                            \
+  TEST_P(olGetMemInfoHostTest, FUNCTION) {                                     \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_HOST);                                   \
+  }                                                                            \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION##Offset) {                           \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_DEVICE);                         \
+  }                                                                            \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION##Offset) {                          \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_MANAGED);                        \
+  }                                                                            \
+  TEST_P(olGetMemInfoHostTest, FUNCTION##Offset) {                             \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_HOST);                           \
+  }
+
+// Each helper below queries with the Ptr argument (base or interior pointer,
+// chosen by PER_ALLOC_TEST) and compares against the fixture's own state.
+
+// OL_MEM_INFO_DEVICE must report the device the fixture allocated on.
+void SuccessDevice(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                   ol_alloc_type_t Type) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, sizeof(RetrievedDevice),
+                              &RetrievedDevice));
+  ASSERT_EQ(RetrievedDevice, Fixture->Device);
+}
+PER_ALLOC_TEST(SuccessDevice);
+
+// OL_MEM_INFO_BASE must report the pointer olMemAlloc returned.
+void SuccessBase(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  void *RetrievedBase;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_BASE, sizeof(RetrievedBase),
+                              &RetrievedBase));
+  ASSERT_EQ(RetrievedBase, Fixture->Ptr);
+}
+PER_ALLOC_TEST(SuccessBase);
+
+// OL_MEM_INFO_SIZE must report the size requested from olMemAlloc.
+void SuccessSize(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  size_t RetrievedSize;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_SIZE, sizeof(RetrievedSize),
+                              &RetrievedSize));
+  ASSERT_EQ(RetrievedSize, SIZE);
+}
+PER_ALLOC_TEST(SuccessSize);
+
+// OL_MEM_INFO_TYPE must report the allocation type the fixture used.
+void SuccessType(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  ol_alloc_type_t RetrievedType;
+  ASSERT_SUCCESS(olGetMemInfo(Ptr, OL_MEM_INFO_TYPE, sizeof(RetrievedType),
+                              &RetrievedType));
+  ASSERT_EQ(RetrievedType, Type);
+}
+PER_ALLOC_TEST(SuccessType);
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNotFound) {
+  // Assuming that we aren't unlucky and happen to get 0x1234 as a random
+  // pointer
+  void *RetrievedBase;
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND,
+               olGetMemInfo(reinterpret_cast<void *>(0x1234), OL_MEM_INFO_BASE,
+                            sizeof(RetrievedBase), &RetrievedBase));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPtr) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetMemInfo(nullptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice), &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeZero) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, 0, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeSmall) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice) - 1, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPointerPropValue) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(
+      OL_ERRC_INVALID_NULL_POINTER,
+      olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, sizeof(RetrievedDevice), nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
new file mode 100644
index 000000000000..f1a1e790fb22
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
@@ -0,0 +1,63 @@
+//===------- Offload API tests - olGetMemInfoSize -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <OffloadAPI.h> + +#include "../common/Fixtures.hpp" + +struct olGetMemInfoSizeTest : OffloadDeviceTest { + void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; } + + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp()); + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 0x1024, &Ptr)); + } + + void TearDown() override { + ASSERT_SUCCESS(olMemFree(Ptr)); + RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown()); + } + + void *Ptr; +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoSizeTest); + +TEST_P(olGetMemInfoSizeTest, SuccessDevice) { + size_t Size = 0; + ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, &Size)); + ASSERT_EQ(Size, sizeof(ol_device_handle_t)); +} + +TEST_P(olGetMemInfoSizeTest, SuccessBase) { + size_t Size = 0; + ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_BASE, &Size)); + ASSERT_EQ(Size, sizeof(void *)); +} + +TEST_P(olGetMemInfoSizeTest, SuccessSize) { + size_t Size = 0; + ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_SIZE, &Size)); + ASSERT_EQ(Size, sizeof(size_t)); +} + +TEST_P(olGetMemInfoSizeTest, SuccessType) { + size_t Size = 0; + ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_TYPE, &Size)); + ASSERT_EQ(Size, sizeof(ol_alloc_type_t)); +} + +TEST_P(olGetMemInfoSizeTest, InvalidSymbolInfoEnumeration) { + size_t Size = 0; + ASSERT_ERROR(OL_ERRC_INVALID_ENUMERATION, + olGetMemInfoSize(Ptr, OL_MEM_INFO_FORCE_UINT32, &Size)); +} + +TEST_P(olGetMemInfoSizeTest, InvalidNullPointer) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, nullptr)); +} diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp index 00e428ec2abc..445262aa0c58 100644 --- a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp +++ 
b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp @@ -34,6 +34,26 @@ TEST_P(olMemAllocTest, SuccessAllocDevice) { olMemFree(Alloc); } +TEST_P(olMemAllocTest, SuccessAllocMany) { + std::vector<void *> Allocs; + Allocs.reserve(1000); + + constexpr ol_alloc_type_t TYPES[3] = { + OL_ALLOC_TYPE_DEVICE, OL_ALLOC_TYPE_MANAGED, OL_ALLOC_TYPE_HOST}; + + for (size_t I = 1; I < 1000; I++) { + void *Alloc = nullptr; + ASSERT_SUCCESS(olMemAlloc(Device, TYPES[I % 3], 1024 * I, &Alloc)); + ASSERT_NE(Alloc, nullptr); + + Allocs.push_back(Alloc); + } + + for (auto *A : Allocs) { + olMemFree(A); + } +} + TEST_P(olMemAllocTest, InvalidNullDevice) { void *Alloc = nullptr; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp new file mode 100644 index 000000000000..a84ed3d78ecc --- /dev/null +++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp @@ -0,0 +1,193 @@ +//===------- Offload API tests - olMemFill --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +struct olMemFillTest : OffloadQueueTest { + template <typename PatternTy, PatternTy PatternVal, size_t Size, + bool Block = false> + void test_body() { + ManuallyTriggeredTask Manual; + + // Block/enqueue tests ensure that the test has been enqueued to a queue + // (rather than being done synchronously if the queue happens to be empty) + if constexpr (Block) { + ASSERT_SUCCESS(Manual.enqueue(Queue)); + } + + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + PatternTy Pattern = PatternVal; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + if constexpr (Block) { + ASSERT_SUCCESS(Manual.trigger()); + } + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternTy *AllocPtr = reinterpret_cast<PatternTy *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); + } +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest); + +TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success8NotMultiple4) { + test_body<uint8_t, 0x42, 1023>(); +} +TEST_P(olMemFillTest, Success8Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) { + test_body<uint8_t, 0x42, 1023, true>(); +} + +TEST_P(olMemFillTest, Success16) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success16NotMultiple4) { + test_body<uint16_t, 0x4243, 1022>(); +} +TEST_P(olMemFillTest, Success16Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) { + test_body<uint16_t, 0x4243, 1022, true>(); +} + +TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); } 
+TEST_P(olMemFillTest, Success32Enqueue) { + test_body<uint32_t, 0xDEADBEEF, 1024, true>(); +} + +TEST_P(olMemFillTest, SuccessLarge) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeEnqueue) { + constexpr size_t Size = 1024; + void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + Manual.trigger(); + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeByteAligned) { + constexpr size_t Size = 17 * 64; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct __attribute__((packed)) PatternT { + uint64_t A; + uint64_t B; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, 
UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) { + constexpr size_t Size = 17 * 64; + void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct __attribute__((packed)) PatternT { + uint64_t A; + uint64_t B; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + Manual.trigger(); + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, InvalidPatternSize) { + constexpr size_t Size = 1025; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint16_t Pattern = 0x4242; + ASSERT_ERROR(OL_ERRC_INVALID_SIZE, + olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + olMemFree(Alloc); +} diff --git a/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp b/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp new file mode 100644 index 000000000000..02e805dd1135 --- /dev/null +++ b/offload/unittests/OffloadAPI/program/olIsValidBinary.cpp @@ -0,0 +1,49 @@ +//===------- Offload API tests - olIsValidBinary --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +using olIsValidBinaryTest = OffloadDeviceTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olIsValidBinaryTest); + +TEST_P(olIsValidBinaryTest, Success) { + + std::unique_ptr<llvm::MemoryBuffer> DeviceBin; + ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin)); + ASSERT_GE(DeviceBin->getBufferSize(), 0lu); + + bool IsValid = false; + ASSERT_SUCCESS(olIsValidBinary(Device, DeviceBin->getBufferStart(), + DeviceBin->getBufferSize(), &IsValid)); + ASSERT_TRUE(IsValid); + + ASSERT_SUCCESS( + olIsValidBinary(Device, DeviceBin->getBufferStart(), 0, &IsValid)); + ASSERT_FALSE(IsValid); +} + +TEST_P(olIsValidBinaryTest, Invalid) { + + std::unique_ptr<llvm::MemoryBuffer> DeviceBin; + ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin)); + ASSERT_GE(DeviceBin->getBufferSize(), 0lu); + + bool IsValid = false; + ASSERT_SUCCESS( + olIsValidBinary(Device, DeviceBin->getBufferStart(), 0, &IsValid)); + ASSERT_FALSE(IsValid); +} + +TEST_P(olIsValidBinaryTest, NullPointer) { + bool IsValid = false; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olIsValidBinary(Device, nullptr, 42, &IsValid)); + ASSERT_FALSE(IsValid); +} diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp index 0dc8527df532..aa9e372ede2c 100644 --- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp +++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp @@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) { Queue = nullptr; } +TEST_P(olDestroyQueueTest, SuccessDelayedResolution) { + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + ASSERT_SUCCESS(olDestroyQueue(Queue)); + Queue = nullptr; + + ASSERT_SUCCESS(Manual.trigger()); +} + 
TEST_P(olDestroyQueueTest, InvalidNullHandle) { ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr)); } diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp new file mode 100644 index 000000000000..aa86750f6adf --- /dev/null +++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp @@ -0,0 +1,107 @@ +//===------- Offload API tests - olLaunchHostFunction ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> +#include <thread> + +struct olLaunchHostFunctionTest : OffloadQueueTest {}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionTest); + +struct olLaunchHostFunctionKernelTest : OffloadKernelTest {}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionKernelTest); + +TEST_P(olLaunchHostFunctionTest, Success) { + ASSERT_SUCCESS(olLaunchHostFunction(Queue, [](void *) {}, nullptr)); +} + +TEST_P(olLaunchHostFunctionTest, SuccessSequence) { + uint32_t Buff[16] = {1, 1}; + + for (auto BuffPtr = &Buff[2]; BuffPtr != &Buff[16]; BuffPtr++) { + ASSERT_SUCCESS(olLaunchHostFunction( + Queue, + [](void *BuffPtr) { + uint32_t *AsU32 = reinterpret_cast<uint32_t *>(BuffPtr); + AsU32[0] = AsU32[-1] + AsU32[-2]; + }, + BuffPtr)); + } + + ASSERT_SUCCESS(olSyncQueue(Queue)); + + for (uint32_t i = 2; i < 16; i++) { + ASSERT_EQ(Buff[i], Buff[i - 1] + Buff[i - 2]); + } +} + +TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) { + // Verify that a host kernel can block execution - A host task is created that + // only resolves when Block is set to false. 
+ ol_kernel_launch_size_args_t LaunchArgs; + LaunchArgs.Dimensions = 1; + LaunchArgs.GroupSize = {64, 1, 1}; + LaunchArgs.NumGroups = {1, 1, 1}; + LaunchArgs.DynSharedMemory = 0; + + ol_queue_handle_t Queue; + ASSERT_SUCCESS(olCreateQueue(Device, &Queue)); + + void *Mem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem)); + + uint32_t *Data = (uint32_t *)Mem; + for (uint32_t i = 0; i < 64; i++) { + Data[i] = 0; + } + + volatile bool Block = true; + ASSERT_SUCCESS(olLaunchHostFunction( + Queue, + [](void *Ptr) { + volatile bool *Block = + reinterpret_cast<volatile bool *>(reinterpret_cast<bool *>(Ptr)); + + while (*Block) + std::this_thread::yield(); + }, + const_cast<bool *>(&Block))); + + struct { + void *Mem; + } Args{Mem}; + ASSERT_SUCCESS( + olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs)); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + for (uint32_t i = 0; i < 64; i++) { + ASSERT_EQ(Data[i], 0); + } + + Block = false; + ASSERT_SUCCESS(olSyncQueue(Queue)); + + for (uint32_t i = 0; i < 64; i++) { + ASSERT_EQ(Data[i], i); + } + + ASSERT_SUCCESS(olDestroyQueue(Queue)); + ASSERT_SUCCESS(olMemFree(Mem)); +} + +TEST_P(olLaunchHostFunctionTest, InvalidNullCallback) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olLaunchHostFunction(Queue, nullptr, nullptr)); +} + +TEST_P(olLaunchHostFunctionTest, InvalidNullQueue) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olLaunchHostFunction(nullptr, [](void *) {}, nullptr)); +} |
