diff options
Diffstat (limited to 'offload/plugins-nextgen/common/include')
| -rw-r--r-- | offload/plugins-nextgen/common/include/DLWrap.h | 286 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/GlobalHandler.h | 174 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/JIT.h | 125 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/MemoryManager.h | 347 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/PluginInterface.h | 1537 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/RPC.h | 69 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/Utils/ELF.h | 44 |
7 files changed, 2582 insertions, 0 deletions
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h new file mode 100644 index 000000000000..8934e7e70102 --- /dev/null +++ b/offload/plugins-nextgen/common/include/DLWrap.h @@ -0,0 +1,286 @@ +//===-- Shared/DLWrap.h - Convenience wrapper for dlopen/dlsym --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The openmp plugins depend on extern libraries. These can be used via: +// - bitcode file statically linked +// - (relocatable) object file statically linked +// - static library +// - dynamic library, linked at build time +// - dynamic library, loaded at application run time by dlopen +// +// This file factors out most boilerplate for using a dlopened library. +// - Function symbols are generated that are statically linked against +// - The dlopen can be done implicitly when initializing the library +// - dlsym lookups are done once and cached +// - The abstraction is very thin to permit varied uses of the library +// +// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to: +// int foo(char x0, double x1, void* x2) { +// constexpr size_t index = id(); +// void * dlsymResult = pointer(index); +// return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2); +// } +// +// Multiple calls to DLWRAP(symbol_name, arity) with bespoke +// initialization code that can use the thin abstraction: +// namespace dlwrap { +// static size_t size(); +// static const char *symbol(size_t); +// static void **pointer(size_t); +// } +// will compile to an object file that only exposes the symbols that the +// dynamic library would do, with the right function types. 
+// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SHARED_DLWRAP_H +#define OMPTARGET_SHARED_DLWRAP_H + +#include <array> +#include <cstddef> +#include <tuple> +#include <type_traits> + +// Where symbol is a function, these expand to some book keeping and an +// implementation of that function +#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY) +#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) + +// For example, given a prototype: +// int foo(char, double); +// +// DLWRAP(foo, 2) expands to: +// +// namespace dlwrap { +// struct foo_Trait : public dlwrap::trait<decltype(&foo)> { +// using T = dlwrap::trait<decltype(&foo)>; +// static T::FunctionType get() { +// constexpr size_t Index = getIndex(); +// void *P = *dlwrap::pointer(Index); +// return reinterpret_cast<T::FunctionType>(P); +// } +// }; +// } +// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); } +// +// DLWRAP_INTERNAL is similar, except the function it expands to is: +// static int dlwrap_foo(char x0, double x1) { ... } +// so that the function pointer call can be wrapped in library-specific code +// +// DLWRAP_INITIALIZE() declares static functions: +#define DLWRAP_INITIALIZE() \ + namespace dlwrap { \ + static size_t size(); \ + static const char *symbol(size_t); /* get symbol name in [0, size()) */ \ + static void ** \ + pointer(size_t); /* get pointer to function pointer in [0, size()) */ \ + } + +// DLWRAP_FINALIZE() implements the functions from DLWRAP_INITIALIZE +#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL() + +// Implementation details follow. + +namespace dlwrap { + +// Extract return / argument types from address of function symbol +template <typename F> struct trait; +template <typename R, typename... 
Ts> struct trait<R (*)(Ts...)> { + constexpr static const size_t nargs = sizeof...(Ts); + typedef R ReturnType; + template <size_t i> struct arg { + typedef typename std::tuple_element<i, std::tuple<Ts...>>::type type; + }; + + typedef R (*FunctionType)(Ts...); +}; + +namespace type { +// Book keeping is by type specialization + +template <size_t S> struct count { + static constexpr size_t N = count<S - 1>::N; +}; + +template <> struct count<0> { static constexpr size_t N = 0; }; + +// Get a constexpr size_t ID, starts at zero +#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N) + +// Increment value returned by DLWRAP_ID +#define DLWRAP_INC() \ + template <> struct dlwrap::type::count<__LINE__> { \ + static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \ + } + +template <size_t N> struct symbol; +#define DLWRAP_SYMBOL(SYMBOL, ID) \ + template <> struct dlwrap::type::symbol<ID> { \ + static constexpr const char *call() { return #SYMBOL; } \ + } +} // namespace type + +template <size_t N, size_t... 
Is> +constexpr std::array<const char *, N> static getSymbolArray( + std::index_sequence<Is...>) { + return {{dlwrap::type::symbol<Is>::call()...}}; +} + +template <size_t Requested, size_t Required> constexpr void verboseAssert() { + static_assert(Requested == Required, "Arity Error"); +} + +} // namespace dlwrap + +#define DLWRAP_INSTANTIATE(SYM_DEF, SYM_USE, ARITY) \ + DLWRAP_INSTANTIATE_##ARITY(SYM_DEF, SYM_USE, \ + dlwrap::trait<decltype(&SYM_USE)>) + +#define DLWRAP_FINALIZE_IMPL() \ + static size_t dlwrap::size() { return DLWRAP_ID(); } \ + static const char *dlwrap::symbol(size_t i) { \ + static constexpr const std::array<const char *, DLWRAP_ID()> \ + dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \ + std::make_index_sequence<DLWRAP_ID()>()); \ + return dlwrap_symbols[i]; \ + } \ + static void **dlwrap::pointer(size_t i) { \ + static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \ + return &dlwrap_pointers.data()[i]; \ + } + +#define DLWRAP_COMMON(SYMBOL, ARITY) \ + DLWRAP_INC(); \ + DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \ + namespace dlwrap { \ + struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> { \ + using T = dlwrap::trait<decltype(&SYMBOL)>; \ + static T::FunctionType get() { \ + verboseAssert<ARITY, trait<decltype(&SYMBOL)>::nargs>(); \ + constexpr size_t Index = DLWRAP_ID() - 1; \ + void *P = *dlwrap::pointer(Index); \ + return reinterpret_cast<T::FunctionType>(P); \ + } \ + }; \ + } + +#define DLWRAP_IMPL(SYMBOL, ARITY) \ + DLWRAP_COMMON(SYMBOL, ARITY) \ + DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY) + +#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \ + DLWRAP_COMMON(SYMBOL, ARITY) \ + static DLWRAP_INSTANTIATE(dlwrap_##SYMBOL, SYMBOL, ARITY) + +#define DLWRAP_INSTANTIATE_0(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); } +#define DLWRAP_INSTANTIATE_1(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \ + return dlwrap::SYM_USE##_Trait::get()(x0); \ + } 
+#define DLWRAP_INSTANTIATE_2(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1); \ + } +#define DLWRAP_INSTANTIATE_3(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \ + } +#define DLWRAP_INSTANTIATE_4(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \ + } +#define DLWRAP_INSTANTIATE_5(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \ + } +#define DLWRAP_INSTANTIATE_6(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \ + } + +#define DLWRAP_INSTANTIATE_7(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \ + } + +#define 
DLWRAP_INSTANTIATE_8(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename T::template arg<7>::type x7) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \ + } +#define DLWRAP_INSTANTIATE_9(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename T::template arg<7>::type x7, \ + typename T::template arg<8>::type x8) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \ + } +#define DLWRAP_INSTANTIATE_10(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename T::template arg<7>::type x7, \ + typename T::template arg<8>::type x8, \ + typename T::template arg<9>::type x9) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ + x9); \ + } +#define DLWRAP_INSTANTIATE_11(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename 
T::template arg<7>::type x7, \ + typename T::template arg<8>::type x8, \ + typename T::template arg<9>::type x9, \ + typename T::template arg<10>::type x10) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10); \ + } + +#endif // OMPTARGET_SHARED_DLWRAP_H diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h new file mode 100644 index 000000000000..829b4b729119 --- /dev/null +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -0,0 +1,174 @@ +//===- GlobalHandler.h - Target independent global & enviroment handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H + +#include <string> + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "Shared/Debug.h" +#include "Shared/Utils.h" + +#include "omptarget.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +class DeviceImageTy; +struct GenericDeviceTy; + +using namespace llvm::object; + +/// Common abstraction for globals that live on the host and device. +/// It simply encapsulates the symbol name, symbol size, and symbol address +/// (which might be host or device depending on the context). +class GlobalTy { + // NOTE: Maybe we can have a pointer to the offload entry name instead of + // holding a private copy of the name as a std::string. 
+ std::string Name; + uint32_t Size; + void *Ptr; + +public: + GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) + : Name(Name), Size(Size), Ptr(Ptr) {} + + const std::string &getName() const { return Name; } + uint32_t getSize() const { return Size; } + void *getPtr() const { return Ptr; } + + void setSize(int32_t S) { Size = S; } + void setPtr(void *P) { Ptr = P; } +}; + +/// Subclass of GlobalTy that holds the memory for a global of \p Ty. +template <typename Ty> class StaticGlobalTy : public GlobalTy { + Ty Data; + +public: + template <typename... Args> + StaticGlobalTy(const std::string &Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + + template <typename... Args> + StaticGlobalTy(const char *Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + + template <typename... Args> + StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args) + : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + + Ty &getValue() { return Data; } + const Ty &getValue() const { return Data; } + void setValue(const Ty &V) { Data = V; } +}; + +/// Helper class to do the heavy lifting when it comes to moving globals between +/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD, +/// which means the only things specialized by the subclass is the retrival of +/// global metadata (size, addr) from the device. +/// \see getGlobalMetadataFromDevice +class GenericGlobalHandlerTy { + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host); + + /// Actually move memory between host and device. 
/// See readGlobalFromDevice and writeGlobalToDevice for the interface
/// description. \p Device2Host selects the direction of the copy: the
/// readGlobalFromDevice wrappers pass true.
Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
                                     const GlobalTy &HostGlobal,
                                     const GlobalTy &DeviceGlobal,
                                     bool Device2Host);

public:
virtual ~GenericGlobalHandlerTy() {}

/// Helper function for getting an ELF from a device image.
Expected<std::unique_ptr<ObjectFile>> getELFObjectFile(DeviceImageTy &Image);

/// Returns whether the symbol named \p SymName is present in the given \p
/// Image.
bool isSymbolInImage(GenericDeviceTy &Device, DeviceImageTy &Image,
                     StringRef SymName);

/// Get the address and size of a global in the image. Address and size are
/// returned in \p ImageGlobal; the global name is passed in \p ImageGlobal.
Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
                                 DeviceImageTy &Image, GlobalTy &ImageGlobal);

/// Read the memory associated with a global from the image and store it on
/// the host. The name, size, and destination are defined by \p HostGlobal.
Error readGlobalFromImage(GenericDeviceTy &Device, DeviceImageTy &Image,
                          const GlobalTy &HostGlobal);

/// Get the address and size of a global from the device. The address is
/// returned in \p DeviceGlobal; the global name and expected size are passed
/// in \p DeviceGlobal. This is the only pure virtual member visible here, so
/// each concrete handler has to provide it.
virtual Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                          DeviceImageTy &Image,
                                          GlobalTy &DeviceGlobal) = 0;

/// Copy the memory associated with a global from the device to its
/// counterpart on the host. The name, size, and destination are defined by
/// \p HostGlobal. The origin is defined by \p DeviceGlobal.
Error readGlobalFromDevice(GenericDeviceTy &Device,
                           const GlobalTy &HostGlobal,
                           const GlobalTy &DeviceGlobal) {
  return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
                                        /*D2H=*/true);
}

/// Copy the memory associated with a global from the device to its
/// counterpart on the host.
The name, size, and destination are defined by + /// \p HostGlobal. The origin is automatically resolved. + Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + /*D2H=*/true); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is defined by \p DeviceGlobal. + Error writeGlobalToDevice(GenericDeviceTy &Device, const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal, + /*D2H=*/false); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is automatically resolved. + Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + /*D2H=*/false); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h new file mode 100644 index 000000000000..b22197b89208 --- /dev/null +++ b/offload/plugins-nextgen/common/include/JIT.h @@ -0,0 +1,125 @@ +//===- JIT.h - Target independent JIT infrastructure ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H + +#include "Shared/EnvironmentVar.h" +#include "Shared/Utils.h" + +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Error.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" + +#include <functional> +#include <memory> +#include <string> + +struct __tgt_device_image; + +namespace llvm { +class MemoryBuffer; + +namespace omp { +namespace target { +namespace plugin { +struct GenericDeviceTy; +} // namespace plugin + +/// The JIT infrastructure and caching mechanism. +struct JITEngine { + /// Function type for a callback that will be called after the backend is + /// called. + using PostProcessingFn = + std::function<Expected<std::unique_ptr<MemoryBuffer>>( + std::unique_ptr<MemoryBuffer>)>; + + JITEngine(Triple::ArchType TA); + + /// Run jit compilation if \p Image is a bitcode image, otherwise simply + /// return \p Image. It is expected to return a memory buffer containing the + /// generated device image that could be loaded to the device directly. + Expected<const __tgt_device_image *> + process(const __tgt_device_image &Image, + target::plugin::GenericDeviceTy &Device); + + /// Return true if \p Image is a bitcode image that can be JITed for the given + /// architecture. + Expected<bool> checkBitcodeImage(StringRef Buffer) const; + +private: + /// Compile the bitcode image \p Image and generate the binary image that can + /// be loaded to the target device of the triple \p Triple architecture \p + /// MCpu. 
\p PostProcessing will be called after codegen to handle cases such + /// as assember as an external tool. + Expected<const __tgt_device_image *> + compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind, + PostProcessingFn PostProcessing); + + /// Create or retrieve the object image file from the file system or via + /// compilation of the \p Image. + Expected<std::unique_ptr<MemoryBuffer>> + getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx, + const std::string &ComputeUnitKind); + + /// Run backend, which contains optimization and code generation. + Expected<std::unique_ptr<MemoryBuffer>> + backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel); + + /// Run optimization pipeline. + void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, + unsigned OptLevel); + + /// Run code generation. + void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, + raw_pwrite_stream &OS); + + /// The target triple used by the JIT. + const Triple TT; + + struct ComputeUnitInfo { + /// LLVM Context in which the modules will be constructed. + LLVMContext Context; + + /// Output images generated from LLVM backend. + SmallVector<std::unique_ptr<MemoryBuffer>, 4> JITImages; + + /// A map of embedded IR images to JITed images. + DenseMap<const __tgt_device_image *, __tgt_device_image *> TgtImageMap; + }; + + /// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute + /// units as they are not CPUs, to the image information we cached for them. + StringMap<ComputeUnitInfo> ComputeUnitMap; + std::mutex ComputeUnitMapMutex; + + /// Control environment variables. 
+ StringEnvar ReplacementObjectFileName = + StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_OBJECT"); + StringEnvar ReplacementModuleFileName = + StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_MODULE"); + StringEnvar PreOptIRModuleFileName = + StringEnvar("LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE"); + StringEnvar PostOptIRModuleFileName = + StringEnvar("LIBOMPTARGET_JIT_POST_OPT_IR_MODULE"); + UInt32Envar JITOptLevel = UInt32Envar("LIBOMPTARGET_JIT_OPT_LEVEL", 3); + BoolEnvar JITSkipOpt = BoolEnvar("LIBOMPTARGET_JIT_SKIP_OPT", false); +}; + +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h new file mode 100644 index 000000000000..fe1989930b76 --- /dev/null +++ b/offload/plugins-nextgen/common/include/MemoryManager.h @@ -0,0 +1,347 @@ +//===----------- MemoryManager.h - Target independent memory manager ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent memory manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H + +#include <cassert> +#include <functional> +#include <list> +#include <mutex> +#include <set> +#include <unordered_map> +#include <vector> + +#include "Shared/Debug.h" +#include "Shared/Utils.h" +#include "omptarget.h" + +/// Base class of per-device allocator. +class DeviceAllocatorTy { +public: + virtual ~DeviceAllocatorTy() = default; + + /// Allocate a memory of size \p Size . 
\p HstPtr is used to assist the + /// allocation. + virtual void *allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; + + /// Delete the pointer \p TgtPtr on the device + virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; +}; + +/// Class of memory manager. The memory manager is per-device by using +/// per-device allocator. Therefore, each plugin using memory manager should +/// have an allocator for each device. +class MemoryManagerTy { + static constexpr const size_t BucketSize[] = { + 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, + 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; + + static constexpr const int NumBuckets = + sizeof(BucketSize) / sizeof(BucketSize[0]); + + /// Find the previous number that is power of 2 given a number that is not + /// power of 2. + static size_t floorToPowerOfTwo(size_t Num) { + Num |= Num >> 1; + Num |= Num >> 2; + Num |= Num >> 4; + Num |= Num >> 8; + Num |= Num >> 16; +#if INTPTR_MAX == INT64_MAX + Num |= Num >> 32; +#elif INTPTR_MAX == INT32_MAX + // Do nothing with 32-bit +#else +#error Unsupported architecture +#endif + Num += 1; + return Num >> 1; + } + + /// Find a suitable bucket + static int findBucket(size_t Size) { + const size_t F = floorToPowerOfTwo(Size); + + DP("findBucket: Size %zu is floored to %zu.\n", Size, F); + + int L = 0, H = NumBuckets - 1; + while (H - L > 1) { + int M = (L + H) >> 1; + if (BucketSize[M] == F) + return M; + if (BucketSize[M] > F) + H = M - 1; + else + L = M; + } + + assert(L >= 0 && L < NumBuckets && "L is out of range"); + + DP("findBucket: Size %zu goes to bucket %d\n", Size, L); + + return L; + } + + /// A structure stores the meta data of a target pointer + struct NodeTy { + /// Memory size + const size_t Size; + /// Target pointer + void *Ptr; + + /// Constructor + NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {} + }; + + /// To make \p NodePtrTy ordered when they're put into \p 
/// std::multiset. Only the size takes part in the ordering, so nodes of the
/// same size are equivalent keys.
struct NodeCmpTy {
  bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
    return LHS.Size < RHS.Size;
  }
};

/// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
/// the look up procedure more efficient. The set stores references, not
/// copies; the nodes themselves live in \p PtrToNodeTable.
using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;

/// A list of \p FreeListTy entries, one per bucket. A node is stored in the
/// bucket that \p findBucket selects for its size.
std::vector<FreeListTy> FreeLists;
/// A list of mutexes, one for each \p FreeListTy entry
std::vector<std::mutex> FreeListLocks;
/// A table to map from a target pointer to its node
std::unordered_map<void *, NodeTy> PtrToNodeTable;
/// The mutex for the table \p PtrToNodeTable
std::mutex MapTableLock;

/// The reference to a device allocator
DeviceAllocatorTy &DeviceAllocator;

/// The threshold to manage memory using memory manager. If the request size
/// is larger than \p SizeThreshold, the allocation will not be managed by the
/// memory manager. Defaults to 8192 bytes (1U << 13).
size_t SizeThreshold = 1U << 13;

/// Request memory from target device. Always asks the allocator for
/// TARGET_ALLOC_DEVICE memory.
void *allocateOnDevice(size_t Size, void *HstPtr) const {
  return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
}

/// Deallocate data on device.
/// NOTE(review): no allocation kind is passed here, so the allocator's
/// default kind argument is used, while allocateOnDevice requests
/// TARGET_ALLOC_DEVICE; confirm the allocators treat both the same on free.
int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }

/// This function is called when it tries to allocate memory on device but the
/// device returns out of memory. It will first free all memory in the
/// FreeList and try to allocate again.
+ void *freeAndAllocate(size_t Size, void *HstPtr) { + std::vector<void *> RemoveList; + + // Deallocate all memory in FreeList + for (int I = 0; I < NumBuckets; ++I) { + FreeListTy &List = FreeLists[I]; + std::lock_guard<std::mutex> Lock(FreeListLocks[I]); + if (List.empty()) + continue; + for (const NodeTy &N : List) { + deleteOnDevice(N.Ptr); + RemoveList.push_back(N.Ptr); + } + FreeLists[I].clear(); + } + + // Remove all nodes in the map table which have been released + if (!RemoveList.empty()) { + std::lock_guard<std::mutex> LG(MapTableLock); + for (void *P : RemoveList) + PtrToNodeTable.erase(P); + } + + // Try allocate memory again + return allocateOnDevice(Size, HstPtr); + } + + /// The goal is to allocate memory on the device. It first tries to + /// allocate directly on the device. If a \p nullptr is returned, it might + /// be because the device is OOM. In that case, it will free all unused + /// memory and then try again. + void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { + void *TgtPtr = allocateOnDevice(Size, HstPtr); + // We cannot get memory from the device. It might be due to OOM. Let's + // free all memory in FreeLists and try again. + if (TgtPtr == nullptr) { + DP("Failed to get memory on device. Free all memory in FreeLists and " + "try again.\n"); + TgtPtr = freeAndAllocate(Size, HstPtr); + } + + if (TgtPtr == nullptr) + DP("Still cannot get memory on device probably because the device is " + "OOM.\n"); + + return TgtPtr; + } + +public: + /// Constructor. If \p Threshold is non-zero, then the default threshold will + /// be overwritten by \p Threshold. 
+ MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0) + : FreeLists(NumBuckets), FreeListLocks(NumBuckets), + DeviceAllocator(DeviceAllocator) { + if (Threshold) + SizeThreshold = Threshold; + } + + /// Destructor + ~MemoryManagerTy() { + for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); + ++Itr) { + assert(Itr->second.Ptr && "nullptr in map table"); + deleteOnDevice(Itr->second.Ptr); + } + } + + /// Allocate memory of size \p Size from target device. \p HstPtr is used to + /// assist the allocation. + void *allocate(size_t Size, void *HstPtr) { + // If the size is zero, we will not bother the target device. Just return + // nullptr directly. + if (Size == 0) + return nullptr; + + DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n", + Size, DPxPTR(HstPtr)); + + // If the size is greater than the threshold, allocate it directly from + // device. + if (Size > SizeThreshold) { + DP("%zu is greater than the threshold %zu. Allocate it directly from " + "device\n", + Size, SizeThreshold); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + + return TgtPtr; + } + + NodeTy *NodePtr = nullptr; + + // Try to get a node from FreeList + { + const int B = findBucket(Size); + FreeListTy &List = FreeLists[B]; + + NodeTy TempNode(Size, nullptr); + std::lock_guard<std::mutex> LG(FreeListLocks[B]); + const auto Itr = List.find(TempNode); + + if (Itr != List.end()) { + NodePtr = &Itr->get(); + List.erase(Itr); + } + } + + if (NodePtr != nullptr) + DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr)); + + // We cannot find a valid node in FreeLists. Let's allocate on device and + // create a node for it. + if (NodePtr == nullptr) { + DP("Cannot find a node in the FreeLists. 
Allocate on device.\n"); + // Allocate one on device + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + if (TgtPtr == nullptr) + return nullptr; + + // Create a new node and add it into the map table + { + std::lock_guard<std::mutex> Guard(MapTableLock); + auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr)); + NodePtr = &Itr.first->second; + } + + DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", + DPxPTR(NodePtr), DPxPTR(TgtPtr), Size); + } + + assert(NodePtr && "NodePtr should not be nullptr at this point"); + + return NodePtr->Ptr; + } + + /// Deallocate memory pointed by \p TgtPtr + int free(void *TgtPtr) { + DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); + + NodeTy *P = nullptr; + + // Look it up into the table + { + std::lock_guard<std::mutex> G(MapTableLock); + auto Itr = PtrToNodeTable.find(TgtPtr); + + // We don't remove the node from the map table because the map does not + // change. + if (Itr != PtrToNodeTable.end()) + P = &Itr->second; + } + + // The memory is not managed by the manager + if (P == nullptr) { + DP("Cannot find its node. Delete it on device directly.\n"); + return deleteOnDevice(TgtPtr); + } + + // Insert the node to the free list + const int B = findBucket(P->Size); + + DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B); + + { + std::lock_guard<std::mutex> G(FreeListLocks[B]); + FreeLists[B].insert(*P); + } + + return OFFLOAD_SUCCESS; + } + + /// Get the size threshold from the environment variable + /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a <tt> + /// std::pair<size_t, bool> </tt> where the first element represents the + /// threshold and the second element represents whether user disables memory + /// manager explicitly by setting the var to 0. If user doesn't specify + /// anything, returns <0, true>. 
+ static std::pair<size_t, bool> getSizeThresholdFromEnv() { + static UInt32Envar MemoryManagerThreshold( + "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD", 0); + + size_t Threshold = MemoryManagerThreshold.get(); + + if (MemoryManagerThreshold.isPresent() && Threshold == 0) { + DP("Disabled memory manager as user set " + "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n"); + return std::make_pair(0, false); + } + + return std::make_pair(Threshold, true); + } +}; + +// GCC still cannot handle the static data member like Clang so we still need +// this part. +constexpr const size_t MemoryManagerTy::BucketSize[]; +constexpr const int MemoryManagerTy::NumBuckets; + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h new file mode 100644 index 000000000000..79e8464bfda5 --- /dev/null +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -0,0 +1,1537 @@ +//===- PluginInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H + +#include <cstddef> +#include <cstdint> +#include <deque> +#include <list> +#include <map> +#include <shared_mutex> +#include <vector> + +#include "Shared/Debug.h" +#include "Shared/Environment.h" +#include "Shared/EnvironmentVar.h" +#include "Shared/Requirements.h" +#include "Shared/Utils.h" + +#include "GlobalHandler.h" +#include "JIT.h" +#include "MemoryManager.h" +#include "RPC.h" +#include "omptarget.h" + +#ifdef OMPT_SUPPORT +#include "omp-tools.h" +#endif + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" + +namespace llvm { +namespace omp { +namespace target { + +namespace plugin { + +struct GenericPluginTy; +struct GenericKernelTy; +struct GenericDeviceTy; + +/// Class that wraps the __tgt_async_info to simply its usage. In case the +/// object is constructed without a valid __tgt_async_info, the object will use +/// an internal one and will synchronize the current thread with the pending +/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function +/// must be called before destroying the wrapper object. +struct AsyncInfoWrapperTy { + AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr); + + ~AsyncInfoWrapperTy() { + assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized"); + } + + /// Get the raw __tgt_async_info pointer. 
+ operator __tgt_async_info *() const { return AsyncInfoPtr; } + + /// Indicate whether there is queue. + bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); } + + /// Get the queue. + template <typename Ty> Ty getQueueAs() { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type"); + return static_cast<Ty>(AsyncInfoPtr->Queue); + } + + /// Set the queue. + template <typename Ty> void setQueueAs(Ty Queue) { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type"); + assert(!AsyncInfoPtr->Queue && "Overwriting queue"); + AsyncInfoPtr->Queue = Queue; + } + + /// Synchronize with the __tgt_async_info's pending operations if it's the + /// internal async info. The error associated to the aysnchronous operations + /// issued in this queue must be provided in \p Err. This function will update + /// the error parameter with the result of the synchronization if it was + /// actually executed. This function must be called before destroying the + /// object and only once. + void finalize(Error &Err); + + /// Register \p Ptr as an associated alloction that is freed after + /// finalization. + void freeAllocationAfterSynchronization(void *Ptr) { + AsyncInfoPtr->AssociatedAllocations.push_back(Ptr); + } + +private: + GenericDeviceTy &Device; + __tgt_async_info LocalAsyncInfo; + __tgt_async_info *AsyncInfoPtr; +}; + +/// The information level represents the level of a key-value property in the +/// info tree print (i.e. indentation). The first level should be the default. +enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 }; + +/// Class for storing device information and later be printed. An object of this +/// type acts as a queue of key-value properties. Each property has a key, a +/// a value, and an optional unit for the value. For printing purposes, the +/// information can be classified into several levels. 
These levels are useful +/// for defining sections and subsections. Thus, each key-value property also +/// has an additional field indicating to which level belongs to. Notice that +/// we use the level to determine the indentation of the key-value property at +/// printing time. See the enum InfoLevelKind for the list of accepted levels. +class InfoQueueTy { + struct InfoQueueEntryTy { + std::string Key; + std::string Value; + std::string Units; + uint64_t Level; + }; + + std::deque<InfoQueueEntryTy> Queue; + +public: + /// Add a new info entry to the queue. The entry requires at least a key + /// string in \p Key. The value in \p Value is optional and can be any type + /// that is representable as a string. The units in \p Units is optional and + /// must be a string. The info level is a template parameter that defaults to + /// the first level (top level). + template <InfoLevelKind L = InfoLevel1, typename T = std::string> + void add(const std::string &Key, T Value = T(), + const std::string &Units = std::string()) { + assert(!Key.empty() && "Invalid info key"); + + // Convert the value to a string depending on its type. + if constexpr (std::is_same_v<T, bool>) + Queue.push_back({Key, Value ? "Yes" : "No", Units, L}); + else if constexpr (std::is_arithmetic_v<T>) + Queue.push_back({Key, std::to_string(Value), Units, L}); + else + Queue.push_back({Key, Value, Units, L}); + } + + /// Print all info entries added to the queue. + void print() const { + // We print four spances for each level. + constexpr uint64_t IndentSize = 4; + + // Find the maximum key length (level + key) to compute the individual + // indentation of each entry. + uint64_t MaxKeySize = 0; + for (const auto &Entry : Queue) { + uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize; + if (KeySize > MaxKeySize) + MaxKeySize = KeySize; + } + + // Print all info entries. + for (const auto &Entry : Queue) { + // Compute the indentations for the current entry. 
+ uint64_t KeyIndentSize = Entry.Level * IndentSize; + uint64_t ValIndentSize = + MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize; + + llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key + << std::string(ValIndentSize, ' ') << Entry.Value + << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n"; + } + } +}; + +/// Class wrapping a __tgt_device_image and its offload entry table on a +/// specific device. This class is responsible for storing and managing +/// the offload entries for an image on a device. +class DeviceImageTy { + /// Image identifier within the corresponding device. Notice that this id is + /// not unique between different device; they may overlap. + int32_t ImageId; + + /// The pointer to the raw __tgt_device_image. + const __tgt_device_image *TgtImage; + const __tgt_device_image *TgtImageBitcode; + + /// Reference to the device this image is loaded on. + GenericDeviceTy &Device; + + /// If this image has any global destructors that much be called. + /// FIXME: This is only required because we currently have no invariants + /// towards the lifetime of the underlying image. We should either copy + /// the image into memory locally or erase the pointers after init. + bool PendingGlobalDtors; + +public: + DeviceImageTy(int32_t Id, GenericDeviceTy &Device, + const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device), + PendingGlobalDtors(false) { + assert(TgtImage && "Invalid target image"); + } + + /// Get the image identifier within the device. + int32_t getId() const { return ImageId; } + + /// Get the device that this image is loaded onto. + GenericDeviceTy &getDevice() const { return Device; } + + /// Get the pointer to the raw __tgt_device_image. 
+ const __tgt_device_image *getTgtImage() const { return TgtImage; } + + void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) { + this->TgtImageBitcode = TgtImageBitcode; + } + + const __tgt_device_image *getTgtImageBitcode() const { + return TgtImageBitcode; + } + + /// Get the image starting address. + void *getStart() const { return TgtImage->ImageStart; } + + /// Get the image size. + size_t getSize() const { + return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + } + + /// Get a memory buffer reference to the whole image. + MemoryBufferRef getMemoryBuffer() const { + return MemoryBufferRef(StringRef((const char *)getStart(), getSize()), + "Image"); + } + /// Accessors to the boolean value + bool setPendingGlobalDtors() { return PendingGlobalDtors = true; } + bool hasPendingGlobalDtors() const { return PendingGlobalDtors; } +}; + +/// Class implementing common functionalities of offload kernels. Each plugin +/// should define the specific kernel class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericKernelTy { + /// Construct a kernel with a name and a execution mode. + GenericKernelTy(const char *Name) + : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {} + + virtual ~GenericKernelTy() {} + + /// Initialize the kernel object from a specific device. + Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image); + virtual Error initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) = 0; + + /// Launch the kernel on the specific device. The device must be the same + /// one used to initialize the kernel. 
+ Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + void *Args, + AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + + /// Get the kernel name. + const char *getName() const { return Name; } + + /// Return true if this kernel is a constructor or destructor. + bool isCtorOrDtor() const { + // TODO: This is not a great solution and should be revisited. + return StringRef(Name).ends_with("tor"); + } + + /// Get the kernel image. + DeviceImageTy &getImage() const { + assert(ImagePtr && "Kernel is not initialized!"); + return *ImagePtr; + } + + /// Return the kernel environment object for kernel \p Name. + const KernelEnvironmentTy &getKernelEnvironmentForKernel() { + return KernelEnvironment; + } + + /// Return a device pointer to a new kernel launch environment. + Expected<KernelLaunchEnvironmentTy *> + getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, + AsyncInfoWrapperTy &AsyncInfo) const; + + /// Indicate whether an execution mode is valid. + static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + case OMP_TGT_EXEC_MODE_GENERIC: + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return true; + } + return false; + } + +protected: + /// Get the execution mode name of the kernel. + const char *getExecutionModeName() const { + switch (KernelEnvironment.Configuration.ExecMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// Prints generic kernel launch information. 
+ Error printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const; + + /// Prints plugin-specific kernel launch information after generic kernel + /// launch information + virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const; + +private: + /// Prepare the arguments before launching the kernel. + void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, uint32_t &NumArgs, + llvm::SmallVectorImpl<void *> &Args, + llvm::SmallVectorImpl<void *> &Ptrs, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; + + /// Get the number of threads and blocks for the kernel based on the + /// user-defined threads and block clauses. + uint32_t getNumThreads(GenericDeviceTy &GenericDevice, + uint32_t ThreadLimitClause[3]) const; + + /// The number of threads \p NumThreads can be adjusted by this method. + /// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via + /// thread_limit clause. + uint64_t getNumBlocks(GenericDeviceTy &GenericDevice, + uint32_t BlockLimitClause[3], uint64_t LoopTripCount, + uint32_t &NumThreads, bool IsNumThreadsFromUser) const; + + /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. + bool isGenericSPMDMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + bool isGenericMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_GENERIC; + } + bool isSPMDMode() const { + return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD; + } + + /// The kernel name. + const char *Name; + + /// The image that contains this kernel. + DeviceImageTy *ImagePtr = nullptr; + +protected: + /// The preferred number of threads to run the kernel. 
+ uint32_t PreferredNumThreads; + + /// The maximum number of threads which the kernel could leverage. + uint32_t MaxNumThreads; + + /// The kernel environment, including execution flags. + KernelEnvironmentTy KernelEnvironment; + + /// The prototype kernel launch environment. + KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// If the kernel is a bare kernel. + bool IsBareKernel = false; +}; + +/// Class representing a map of host pinned allocations. We track these pinned +/// allocations, so memory tranfers invloving these buffers can be optimized. +class PinnedAllocationMapTy { + + /// Struct representing a map entry. + struct EntryTy { + /// The host pointer of the pinned allocation. + void *HstPtr; + + /// The pointer that devices' driver should use to transfer data from/to the + /// pinned allocation. In most plugins, this pointer will be the same as the + /// host pointer above. + void *DevAccessiblePtr; + + /// The size of the pinned allocation. + size_t Size; + + /// Indicate whether the allocation was locked from outside the plugin, for + /// instance, from the application. The externally locked allocations are + /// not unlocked by the plugin when unregistering the last user. + bool ExternallyLocked; + + /// The number of references to the pinned allocation. The allocation should + /// remain pinned and registered to the map until the number of references + /// becomes zero. + mutable size_t References; + + /// Create an entry with the host and device acessible pointers, the buffer + /// size, and a boolean indicating whether the buffer was locked externally. + EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size, + bool ExternallyLocked) + : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size), + ExternallyLocked(ExternallyLocked), References(1) {} + + /// Utility constructor used for std::set searches. 
+ EntryTy(void *HstPtr) + : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0), + ExternallyLocked(false), References(0) {} + }; + + /// Comparator of mep entries. Use the host pointer to enforce an order + /// between entries. + struct EntryCmpTy { + bool operator()(const EntryTy &Left, const EntryTy &Right) const { + return Left.HstPtr < Right.HstPtr; + } + }; + + typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy; + + /// The map of host pinned allocations. + PinnedAllocSetTy Allocs; + + /// The mutex to protect accesses to the map. + mutable std::shared_mutex Mutex; + + /// Reference to the corresponding device. + GenericDeviceTy &Device; + + /// Indicate whether mapped host buffers should be locked automatically. + bool LockMappedBuffers; + + /// Indicate whether failures when locking mapped buffers should be ingored. + bool IgnoreLockMappedFailures; + + /// Find an allocation that intersects with \p HstPtr pointer. Assume the + /// map's mutex is acquired. + const EntryTy *findIntersecting(const void *HstPtr) const { + if (Allocs.empty()) + return nullptr; + + // Search the first allocation with starting address that is not less than + // the buffer address. + auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)}); + + // Direct match of starting addresses. + if (It != Allocs.end() && It->HstPtr == HstPtr) + return &(*It); + + // Not direct match but may be a previous pinned allocation in the map which + // contains the buffer. Return false if there is no such a previous + // allocation. + if (It == Allocs.begin()) + return nullptr; + + // Move to the previous pinned allocation. + --It; + + // The buffer is not contained in the pinned allocation. + if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + return &(*It); + + // None found. + return nullptr; + } + + /// Insert an entry to the map representing a locked buffer. The number of + /// references is set to one. 
+ Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size, + bool ExternallyLocked = false); + + /// Erase an existing entry from the map. + Error eraseEntry(const EntryTy &Entry); + + /// Register a new user into an entry that represents a locked buffer. Check + /// also that the registered buffer with \p HstPtr address and \p Size is + /// actually contained into the entry. + Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size); + + /// Unregister a user from the entry and return whether it is the last user. + /// If it is the last user, the entry will have to be removed from the map + /// and unlock the entry's host buffer (if necessary). + Expected<bool> unregisterEntryUse(const EntryTy &Entry); + + /// Indicate whether the first range A fully contains the second range B. + static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { + void *EndA = advanceVoidPtr(PtrA, SizeA); + void *EndB = advanceVoidPtr(PtrB, SizeB); + return (PtrB >= PtrA && EndB <= EndA); + } + + /// Indicate whether the first range A intersects with the second range B. + static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { + void *EndA = advanceVoidPtr(PtrA, SizeA); + void *EndB = advanceVoidPtr(PtrB, SizeB); + return (PtrA < EndB && PtrB < EndA); + } + +public: + /// Create the map of pinned allocations corresponding to a specific device. + PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) { + + // Envar that indicates whether mapped host buffers should be locked + // automatically. The possible values are boolean (on/off) and a special: + // off: Mapped host buffers are not locked. + // on: Mapped host buffers are locked in a best-effort approach. + // Failure to lock the buffers are silent. + // mandatory: Mapped host buffers are always locked and failures to lock + // a buffer results in a fatal error. 
+ StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS", + "off"); + + bool Enabled; + if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) { + // Parsed as a boolean value. Enable the feature if necessary. + LockMappedBuffers = Enabled; + IgnoreLockMappedFailures = true; + } else if (OMPX_LockMappedBuffers.get() == "mandatory") { + // Enable the feature and failures are fatal. + LockMappedBuffers = true; + IgnoreLockMappedFailures = false; + } else { + // Disable by default. + DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n", + OMPX_LockMappedBuffers.get().data()); + LockMappedBuffers = false; + } + } + + /// Register a buffer that was recently allocated as a locked host buffer. + /// None of the already registered pinned allocations should intersect with + /// this new one. The registration requires the host pointer in \p HstPtr, + /// the device accessible pointer in \p DevAccessiblePtr, and the size of the + /// allocation in \p Size. The allocation must be unregistered using the + /// unregisterHostBuffer function. + Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size); + + /// Unregister a host pinned allocation passing the host pointer which was + /// previously registered using the registerHostBuffer function. When calling + /// this function, the pinned allocation cannot have any other user and will + /// not be unlocked by this function. + Error unregisterHostBuffer(void *HstPtr); + + /// Lock the host buffer at \p HstPtr or register a new user if it intersects + /// with an already existing one. A partial overlapping with extension is not + /// allowed. The function returns the device accessible pointer of the pinned + /// buffer. The buffer must be unlocked using the unlockHostBuffer function. + Expected<void *> lockHostBuffer(void *HstPtr, size_t Size); + + /// Unlock the host buffer at \p HstPtr or unregister a user if other users + /// are still using the pinned allocation. 
If this was the last user, the + /// pinned allocation is removed from the map and the memory is unlocked. + Error unlockHostBuffer(void *HstPtr); + + /// Lock or register a host buffer that was recently mapped by libomptarget. + /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is + /// enabled. Even if not enabled, externally locked buffers are registered + /// in order to optimize their transfers. + Error lockMappedHostBuffer(void *HstPtr, size_t Size); + + /// Unlock or unregister a host buffer that was unmapped by libomptarget. + Error unlockUnmappedHostBuffer(void *HstPtr); + + /// Return the device accessible pointer associated to the host pinned + /// allocation which the \p HstPtr belongs, if any. Return null in case the + /// \p HstPtr does not belong to any host pinned allocation. The device + /// accessible pointer is the one that devices should use for data transfers + /// that involve a host pinned buffer. + void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const { + std::shared_lock<std::shared_mutex> Lock(Mutex); + + // Find the intersecting allocation if any. + const EntryTy *Entry = findIntersecting(HstPtr); + if (!Entry) + return nullptr; + + return advanceVoidPtr(Entry->DevAccessiblePtr, + getPtrDiff(HstPtr, Entry->HstPtr)); + } + + /// Check whether a buffer belongs to a registered host pinned allocation. + bool isHostPinnedBuffer(const void *HstPtr) const { + std::shared_lock<std::shared_mutex> Lock(Mutex); + + // Return whether there is an intersecting allocation. + return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr); + } +}; + +/// Class implementing common functionalities of offload devices. Each plugin +/// should define the specific device class, derive from this generic one, and +/// implement the necessary virtual function members. 
+struct GenericDeviceTy : public DeviceAllocatorTy { + /// Construct a device with its device id within the plugin, the number of + /// devices in the plugin and the grid values for that kind of device. + GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &GridValues); + + /// Get the device identifier within the corresponding plugin. Notice that + /// this id is not unique between different plugins; they may overlap. + int32_t getDeviceId() const { return DeviceId; } + + /// Set the context of the device if needed, before calling device-specific + /// functions. Plugins may implement this function as a no-op if not needed. + virtual Error setContext() = 0; + + /// Initialize the device. After this call, the device should be already + /// working and ready to accept queries or modifications. + Error init(GenericPluginTy &Plugin); + virtual Error initImpl(GenericPluginTy &Plugin) = 0; + + /// Deinitialize the device and free all its resources. After this call, the + /// device is no longer considered ready, so no queries or modifications are + /// allowed. + Error deinit(GenericPluginTy &Plugin); + virtual Error deinitImpl() = 0; + + /// Load the binary image into the device and return the target table. + Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); + virtual Expected<DeviceImageTy *> + loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; + + /// Setup the device environment if needed. Notice this setup may not be run + /// on some plugins. By default, it will be executed, but plugins can change + /// this behavior by overriding the shouldSetupDeviceEnvironment function. + Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + + /// Setup the global device memory pool, if the plugin requires one. 
+ Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, + uint64_t PoolSize); + + // Setup the RPC server for this device if needed. This may not run on some + // plugins like the CPU targets. By default, it will not be executed so it is + // up to the target to override this using the shouldSetupRPCServer function. + Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); + + /// Synchronize the current thread with the pending operations on the + /// __tgt_async_info structure. + Error synchronize(__tgt_async_info *AsyncInfo); + virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Invokes any global constructors on the device if present and is required + /// by the target. + virtual Error callGlobalConstructors(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + return Error::success(); + } + + /// Invokes any global destructors on the device if present and is required + /// by the target. + virtual Error callGlobalDestructors(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + return Error::success(); + } + + /// Query for the completion of the pending operations on the __tgt_async_info + /// structure in a non-blocking manner. + Error queryAsync(__tgt_async_info *AsyncInfo); + virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Check whether the architecture supports VA management + virtual bool supportVAManagement() const { return false; } + + /// Get the total device memory size + virtual Error getDeviceMemorySize(uint64_t &DSize); + + /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to + /// map it to \p VAddr. The obtained address is stored in \p Addr. At return + /// \p RSize contains the actual size which can be equal or larger than the + /// requested size. 
+ virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize); + + /// De-allocates device memory and unmaps the virtual address \p VAddr + virtual Error memoryVAUnMap(void *VAddr, size_t Size); + + /// Allocate data on the device or involving the device. + Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); + + /// Deallocate data from the device or involving the device. + Error dataDelete(void *TgtPtr, TargetAllocTy Kind); + + /// Pin host memory to optimize transfers and return the device accessible + /// pointer that devices should use for memory transfers involving the host + /// pinned allocation. + Expected<void *> dataLock(void *HstPtr, int64_t Size) { + return PinnedAllocs.lockHostBuffer(HstPtr, Size); + } + + /// Unpin a host memory buffer that was previously pinned. + Error dataUnlock(void *HstPtr) { + return PinnedAllocs.unlockHostBuffer(HstPtr); + } + + /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific + /// API and return the device accessible pointer. + virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0; + + /// Unlock a previously locked host buffer starting at \p HstPtr. + virtual Error dataUnlockImpl(void *HstPtr) = 0; + + /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped + /// buffer. This means that libomptarget created a new mapping of that host + /// buffer (e.g., because a user OpenMP target map) and the buffer may be used + /// as source/destination of memory transfers. We can use this information to + /// lock the host buffer and optimize its memory transfers. + Error notifyDataMapped(void *HstPtr, int64_t Size) { + return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size); + } + + /// Mark the host buffer with address \p HstPtr as unmapped. This means that + /// libomptarget removed an existing mapping. If the plugin locked the buffer + /// in notifyDataMapped, this function should unlock it. 
+ Error notifyDataUnmapped(void *HstPtr) { + return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr); + } + + /// Check whether the host buffer with address \p HstPtr is pinned by the + /// underlying vendor-specific runtime (if any). Retrieve the host pointer, + /// the device accessible pointer and the size of the original pinned buffer. + virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, + void *&BaseDevAccessiblePtr, + size_t &BaseSize) const = 0; + + /// Submit data to the device (host to device transfer). + Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Exchange data between devices (device to device transfer). Calling this + /// function is only valid if GenericPlugin::isDataExchangable() passing the + /// two devices returns true. + Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Run the kernel associated with \p EntryPtr + Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, + KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); + + /// Initialize a __tgt_async_info structure. Related to interop features. + Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr); + virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Initialize a __tgt_device_info structure. Related to interop features. 
+ Error initDeviceInfo(__tgt_device_info *DeviceInfo); + virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + + /// Create an event. + Error createEvent(void **EventPtrStorage); + virtual Error createEventImpl(void **EventPtrStorage) = 0; + + /// Destroy an event. + Error destroyEvent(void *Event); + virtual Error destroyEventImpl(void *EventPtr) = 0; + + /// Start the recording of the event. + Error recordEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Wait for an event to finish. Notice this wait is asynchronous if the + /// __tgt_async_info is not nullptr. + Error waitEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Synchronize the current thread with the event. + Error syncEvent(void *EventPtr); + virtual Error syncEventImpl(void *EventPtr) = 0; + + /// Print information about the device. + Error printInfo(); + virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0; + + /// Getters of the grid values. + uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; } + uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; } + uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; } + uint32_t getDefaultNumThreads() const { + return GridValues.GV_Default_WG_Size; + } + uint32_t getDefaultNumBlocks() const { + return GridValues.GV_Default_Num_Teams; + } + uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } + + /// Get target compute unit kind (e.g., sm_80, or gfx908). + virtual std::string getComputeUnitKind() const { return "unknown"; } + + /// Post processing after jit backend. The ownership of \p MB will be taken. 
+ virtual Expected<std::unique_ptr<MemoryBuffer>> + doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const { + return std::move(MB); + } + + /// The minimum number of threads we use for a low-trip count combined loop. + /// Instead of using more threads we increase the outer (block/team) + /// parallelism. + /// @see OMPX_MinThreadsForLowTripCount + virtual uint32_t getMinThreadsForLowTripCountLoop() { + return OMPX_MinThreadsForLowTripCount; + } + + /// Get the total amount of hardware parallelism supported by the target + /// device. This is the total amount of warps or wavefronts that can be + /// resident on the device simultaneously. + virtual uint64_t getHardwareParallelism() const { return 0; } + + /// Get the RPC server running on this device. + RPCServerTy *getRPCServer() const { return RPCServer; } + + /// The number of parallel RPC ports to use on the device. In general, this + /// should be roughly equivalent to the amount of hardware parallelism the + /// device can support. This is because GPUs in general do not have forward + /// progress guarantees, so we minimize thread level dependencies by + /// allocating enough space such that each device thread can have a port. This + /// is likely overly pessimistic in the average case, but guarantees no + /// deadlocks at the cost of memory. This must be overloaded by targets + /// expecting to use the RPC server. + virtual uint64_t requestedRPCPortCount() const { + assert(!shouldSetupRPCServer() && "Default implementation cannot be used"); + return 0; + } + + virtual Error getDeviceStackSize(uint64_t &V) = 0; + + /// Returns true if current plugin architecture is an APU + /// and unified_shared_memory was not requested by the program. + bool useAutoZeroCopy(); + virtual bool useAutoZeroCopyImpl() { return false; } + + /// Allocate and construct a kernel object. 
+ virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0; + + /// Reference to the underlying plugin that created this device. + GenericPluginTy &Plugin; + +private: + /// Get and set the stack size and heap size for the device. If not used, the + /// plugin can implement the setters as no-op and setting the output + /// value to zero for the getters. + virtual Error setDeviceStackSize(uint64_t V) = 0; + virtual Error getDeviceHeapSize(uint64_t &V) = 0; + virtual Error setDeviceHeapSize(uint64_t V) = 0; + + /// Indicate whether the device should setup the device environment. Notice + /// that returning false in this function will change the behavior of the + /// setupDeviceEnvironment() function. + virtual bool shouldSetupDeviceEnvironment() const { return true; } + + /// Indicate whether the device should setup the global device memory pool. If + /// false is return the value on the device will be uninitialized. + virtual bool shouldSetupDeviceMemoryPool() const { return true; } + + /// Indicate whether or not the device should setup the RPC server. This is + /// only necessary for unhosted targets like the GPU. + virtual bool shouldSetupRPCServer() const { return false; } + + /// Pointer to the memory manager or nullptr if not available. + MemoryManagerTy *MemoryManager; + + /// Environment variables defined by the OpenMP standard. + Int32Envar OMP_TeamLimit; + Int32Envar OMP_NumTeams; + Int32Envar OMP_TeamsThreadLimit; + + /// Environment variables defined by the LLVM OpenMP implementation. + Int32Envar OMPX_DebugKind; + UInt32Envar OMPX_SharedMemorySize; + UInt64Envar OMPX_TargetStackSize; + UInt64Envar OMPX_TargetHeapSize; + + /// Environment flag to set the minimum number of threads we use for a + /// low-trip count combined loop. Instead of using more threads we increase + /// the outer (block/team) parallelism. 
+ UInt32Envar OMPX_MinThreadsForLowTripCount = + UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32); + +protected: + /// Environment variables defined by the LLVM OpenMP implementation + /// regarding the initial number of streams and events. + UInt32Envar OMPX_InitialNumStreams; + UInt32Envar OMPX_InitialNumEvents; + + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. + llvm::SmallVector<DeviceImageTy *> LoadedImages; + + /// The identifier of the device within the plugin. Notice this is not a + /// global device id and is not the device id visible to the OpenMP user. + const int32_t DeviceId; + + /// The default grid values used for this device. + llvm::omp::GV GridValues; + + /// Enumeration used for representing the current peer access state between + /// two devices (both under the same plugin). + /// The states can be a) PENDING when the state has not been queried and needs + /// to be queried, b) AVAILABLE when the peer access is available to be used, + /// and c) UNAVAILABLE if the system does not allow it. + enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; + + /// Array of peer access states with the rest of devices. This means that if + /// the device I has an array PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I can access device J's memory directly. However, notice this + /// does not mean that device J can access device I's memory directly. + llvm::SmallVector<PeerAccessState> PeerAccesses; + std::mutex PeerAccessesLock; + + /// Map of host pinned allocations used to optimize device transfers. + PinnedAllocationMapTy PinnedAllocs; + + /// A pointer to an RPC server instance attached to this device if present. + /// This is used to run the RPC server during task synchronization. 
+ RPCServerTy *RPCServer; + +#ifdef OMPT_SUPPORT + /// OMPT callback functions +#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr; + FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback) +#undef defineOmptCallback + + /// Internal representation for OMPT device (initialize & finalize) + std::atomic<bool> OmptInitialized; +#endif + +private: + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; + DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; +}; + +/// Class implementing common functionalities of offload plugins. Each plugin +/// should define the specific plugin class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericPluginTy { + + /// Construct a plugin instance. + GenericPluginTy(Triple::ArchType TA) + : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), JIT(TA), + RPCServer(nullptr) {} + + virtual ~GenericPluginTy() {} + + /// Initialize the plugin. + Error init(); + + /// Initialize the plugin and return the number of available devices. + virtual Expected<int32_t> initImpl() = 0; + + /// Deinitialize the plugin and release the resources. + Error deinit(); + virtual Error deinitImpl() = 0; + + /// Create a new device for the underlying plugin. + virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin, + int32_t DeviceID, + int32_t NumDevices) = 0; + + /// Create a new global handler for the underlying plugin. + virtual GenericGlobalHandlerTy *createGlobalHandler() = 0; + + /// Get the reference to the device with a certain device id. + GenericDeviceTy &getDevice(int32_t DeviceId) { + assert(isValidDeviceId(DeviceId) && "Invalid device id"); + assert(Devices[DeviceId] && "Device is unitialized"); + + return *Devices[DeviceId]; + } + + /// Get the number of active devices. + int32_t getNumDevices() const { return NumDevices; } + + /// Get the plugin-specific device identifier offset. 
+ int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; } + + /// Set the plugin-specific device identifier offset. + void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; } + + /// Get the ELF code to recognize the binary image of this plugin. + virtual uint16_t getMagicElfBits() const = 0; + + /// Get the target triple of this plugin. + virtual Triple::ArchType getTripleArch() const = 0; + + /// Allocate a structure using the internal allocator. + template <typename Ty> Ty *allocate() { + return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty))); + } + + /// Get the reference to the global handler of this plugin. + GenericGlobalHandlerTy &getGlobalHandler() { + assert(GlobalHandler && "Global handler not initialized"); + return *GlobalHandler; + } + + /// Get the reference to the JIT used for all devices connected to this + /// plugin. + JITEngine &getJIT() { return JIT; } + + /// Get a reference to the RPC server used to provide host services. + RPCServerTy &getRPCServer() { + assert(RPCServer && "RPC server not initialized"); + return *RPCServer; + } + + /// Get the OpenMP requires flags set for this plugin. + int64_t getRequiresFlags() const { return RequiresFlags; } + + /// Set the OpenMP requires flags for this plugin. + void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; } + + /// Initialize a device within the plugin. + Error initDevice(int32_t DeviceId); + + /// Deinitialize a device within the plugin and release its resources. + Error deinitDevice(int32_t DeviceId); + + /// Indicate whether data can be exchanged directly between two devices under + /// this same plugin. If this function returns true, it's safe to call the + /// GenericDeviceTy::dataExchange() function on the source device. 
+ virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); + } + + /// Top level interface to verify if a given ELF image can be executed on a + /// given target. Returns true if the \p Image is compatible with the plugin. + Expected<bool> checkELFImage(StringRef Image) const; + + /// Indicate if an image is compatible with the plugin devices. Notice that + /// this function may be called before actually initializing the devices. So + /// we could not move this function into GenericDeviceTy. + virtual Expected<bool> isELFCompatible(StringRef Image) const = 0; + +protected: + /// Indicate whether a device id is valid. + bool isValidDeviceId(int32_t DeviceId) const { + return (DeviceId >= 0 && DeviceId < getNumDevices()); + } + +public: + // TODO: This plugin interface needs to be cleaned up. + + /// Returns non-zero if the provided \p Image can be executed by the runtime. + int32_t is_valid_binary(__tgt_device_image *Image); + + /// Initialize the device inside of the plugin. + int32_t init_device(int32_t DeviceId); + + /// Return the number of devices this plugin can support. + int32_t number_of_devices(); + + /// Initializes the OpenMP register requires information. + int64_t init_requires(int64_t RequiresFlags); + + /// Returns non-zero if the data can be exchanged between the two devices. + int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId); + + /// Initializes the record and replay mechanism inside the plugin. + int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize, + void *VAddr, bool isRecord, bool SaveOutput, + uint64_t &ReqPtrArgOffset); + + /// Loads the associated binary into the plugin and returns a handle to it. + int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage, + __tgt_device_binary *Binary); + + /// Allocates memory that is accessible to the given device. 
+ void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind); + + /// Deallocates memory on the given device. + int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind); + + /// Locks / pins host memory using the plugin runtime. + int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size, + void **LockedPtr); + + /// Unlocks / unpins host memory using the plugin runtime. + int32_t data_unlock(int32_t DeviceId, void *Ptr); + + /// Notify the runtime about a new mapping that has been created outside. + int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); + + /// Notify the runtime about a mapping that has been deleted. + int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); + + /// Copy data to the given device. + int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size); + + /// Copy data to the given device asynchronously. + int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); + + /// Copy data from the given device. + int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size); + + /// Copy data from the given device asynchronously. + int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); + + /// Exchange memory addresses between two devices. + int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId, + void *DstPtr, int64_t Size); + + /// Exchange memory addresses between two devices asynchronously. + int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, + int DstDeviceId, void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + + /// Begin executing a kernel on the given device. 
+ int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, + __tgt_async_info *AsyncInfoPtr); + + /// Synchronize an asynchronous queue with the plugin runtime. + int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); + + /// Query the current state of an asynchronous queue. + int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); + + /// Prints information about the given device supported by the plugin. + void print_device_info(int32_t DeviceId); + + /// Creates an event in the given plugin if supported. + int32_t create_event(int32_t DeviceId, void **EventPtr); + + /// Records an event that has occurred. + int32_t record_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr); + + /// Wait until an event has occurred. + int32_t wait_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr); + + /// Synchronize execution until an event is done. + int32_t sync_event(int32_t DeviceId, void *EventPtr); + + /// Remove the event from the plugin. + int32_t destroy_event(int32_t DeviceId, void *EventPtr); + + /// Set the runtime info flag to \p NewInfoLevel. + void set_info_flag(uint32_t NewInfoLevel); + + /// Creates an asynchronous queue for the given plugin. + int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr); + + /// Creates device information to be used for diagnostics. + int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo, + const char **ErrStr); + + /// Sets the offset into the devices for use by OMPT. + int32_t set_device_offset(int32_t DeviceIdOffset); + + /// Returns if the plugin can support automatic zero-copy. + int32_t use_auto_zero_copy(int32_t DeviceId); + + /// Look up a global symbol in the given binary. + int32_t get_global(__tgt_device_binary Binary, uint64_t Size, + const char *Name, void **DevicePtr); + + /// Look up a kernel function in the given binary. 
+ int32_t get_function(__tgt_device_binary Binary, const char *Name, + void **KernelPtr); + +private: + /// Number of devices available for the plugin. + int32_t NumDevices = 0; + + /// Index offset, which when added to a DeviceId, will yield a unique + /// user-observable device identifier. This is especially important when + /// DeviceIds of multiple plugins / RTLs need to be distinguishable. + int32_t DeviceIdStartIndex = 0; + + /// Array of pointers to the devices. Initially, they are all set to nullptr. + /// Once a device is initialized, the pointer is stored in the position given + /// by its device id. A position with nullptr means that the corresponding + /// device was not initialized yet. + llvm::SmallVector<GenericDeviceTy *> Devices; + + /// OpenMP requires flags. + int64_t RequiresFlags; + + /// Pointer to the global handler for this plugin. + GenericGlobalHandlerTy *GlobalHandler; + + /// Internal allocator for different structures. + BumpPtrAllocator Allocator; + + /// The JIT engine shared by all devices connected to this plugin. + JITEngine JIT; + + /// The interface between the plugin and the GPU for host services. + RPCServerTy *RPCServer; +}; + +namespace Plugin { +/// Create a success error. This is the same as calling Error::success(), but +/// it is recommended to use this one for consistency with Plugin::error() and +/// Plugin::check(). +static Error success() { return Error::success(); } + +/// Create a string error. +template <typename... ArgsTy> +static Error error(const char *ErrFmt, ArgsTy... Args) { + return createStringError(inconvertibleErrorCode(), ErrFmt, Args...); +} + +/// Check the plugin-specific error code and return an error or success +/// accordingly. In case of an error, create a string error with the error +/// description. The ErrFmt should follow the format: +/// "Error in <function name>[<optional info>]: %s" +/// The last format specifier "%s" is mandatory and will be used to place the +/// error code's description. 
Notice this function should be only called from +/// the plugin-specific code. +/// TODO: Refactor this, must be defined individually by each plugin. +template <typename... ArgsTy> +static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args); +} // namespace Plugin + +/// Class for simplifying the getter operation of the plugin. Anywhere on the +/// code, the current plugin can be retrieved by Plugin::get(). The class also +/// declares functions to create plugin-specific object instances. The check(), +/// createPlugin(), createDevice() and createGlobalHandler() functions should be +/// defined by each plugin implementation. +class PluginTy { + // Reference to the plugin instance. + static GenericPluginTy *SpecificPlugin; + + PluginTy() { + if (auto Err = init()) + REPORT("Failed to initialize plugin: %s\n", + toString(std::move(Err)).data()); + } + + ~PluginTy() { + if (auto Err = deinit()) + REPORT("Failed to deinitialize plugin: %s\n", + toString(std::move(Err)).data()); + } + + PluginTy(const PluginTy &) = delete; + void operator=(const PluginTy &) = delete; + + /// Create and intialize the plugin instance. + static Error init() { + assert(!SpecificPlugin && "Plugin already created"); + + // Create the specific plugin. + SpecificPlugin = createPlugin(); + assert(SpecificPlugin && "Plugin was not created"); + + // Initialize the plugin. + return SpecificPlugin->init(); + } + + // Deinitialize and destroy the plugin instance. + static Error deinit() { + assert(SpecificPlugin && "Plugin no longer valid"); + + for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices(); + DevNo < NumDev; ++DevNo) + if (auto Err = SpecificPlugin->deinitDevice(DevNo)) + return Err; + + // Deinitialize the plugin. + if (auto Err = SpecificPlugin->deinit()) + return Err; + + // Delete the plugin instance. + delete SpecificPlugin; + + // Invalidate the plugin reference. 
+ SpecificPlugin = nullptr; + + return Plugin::success(); + } + +public: + /// Initialize the plugin if needed. The plugin could have been initialized by + /// a previous call to Plugin::get(). + static Error initIfNeeded() { + // Trigger the initialization if needed. + get(); + + return Error::success(); + } + + /// Get a reference (or create if it was not created) to the plugin instance. + static GenericPluginTy &get() { + // This static variable will initialize the underlying plugin instance in + // case there was no previous explicit initialization. The initialization is + // thread safe. + static PluginTy Plugin; + + assert(SpecificPlugin && "Plugin is not active"); + return *SpecificPlugin; + } + + /// Get a reference to the plugin with a specific plugin-specific type. + template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); } + + /// Indicate whether the plugin is active. + static bool isActive() { return SpecificPlugin != nullptr; } + + /// Create a plugin instance. + static GenericPluginTy *createPlugin(); +}; + +/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class +/// acts as a reference to a device resource, such as a stream, and requires +/// some basic functions to be implemented. The derived class should define an +/// empty constructor that creates an empty and invalid resource reference. Do +/// not create a new resource on the ctor, but on the create() function instead. +/// +/// The derived class should also define the type HandleTy as the underlying +/// resource handle type. For instance, in a CUDA stream it would be: +/// using HandleTy = CUstream; +struct GenericDeviceResourceRef { + /// Create a new resource and stores a reference. + virtual Error create(GenericDeviceTy &Device) = 0; + + /// Destroy and release the resources pointed by the reference. 
+ virtual Error destroy(GenericDeviceTy &Device) = 0; + +protected: + ~GenericDeviceResourceRef() = default; +}; + +/// Class that implements a resource pool belonging to a device. This class +/// operates with references to the actual resources. These reference must +/// derive from the GenericDeviceResourceRef class and implement the create +/// and destroy virtual functions. +template <typename ResourceRef> class GenericDeviceResourceManagerTy { + using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; + using ResourceHandleTy = typename ResourceRef::HandleTy; + +public: + /// Create an empty resource pool for a specific device. + GenericDeviceResourceManagerTy(GenericDeviceTy &Device) + : Device(Device), NextAvailable(0) {} + + /// Destroy the resource pool. At this point, the deinit() function should + /// already have been executed so the resource pool should be empty. + virtual ~GenericDeviceResourceManagerTy() { + assert(ResourcePool.empty() && "Resource pool not empty"); + } + + /// Initialize the resource pool. + Error init(uint32_t InitialSize) { + assert(ResourcePool.empty() && "Resource pool already initialized"); + return ResourcePoolTy::resizeResourcePool(InitialSize); + } + + /// Deinitialize the resource pool and delete all resources. This function + /// must be called before the destructor. + virtual Error deinit() { + if (NextAvailable) + DP("Missing %d resources to be returned\n", NextAvailable); + + // TODO: This prevents a bug on libomptarget to make the plugins fail. There + // may be some resources not returned. Do not destroy these ones. + if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable)) + return Err; + + ResourcePool.clear(); + + return Plugin::success(); + } + + /// Get a resource from the pool or create new ones. If the function + /// succeeds, the handle to the resource is saved in \p Handle. + virtual Error getResource(ResourceHandleTy &Handle) { + // Get a resource with an empty resource processor. 
+ return getResourcesImpl(1, &Handle, + [](ResourceHandleTy) { return Plugin::success(); }); + } + + /// Get multiple resources from the pool or create new ones. If the function + /// succeeds, the handles to the resources are saved in \p Handles. + virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) { + // Get resources with an empty resource processor. + return getResourcesImpl(Num, Handles, + [](ResourceHandleTy) { return Plugin::success(); }); + } + + /// Return resource to the pool. + virtual Error returnResource(ResourceHandleTy Handle) { + // Return a resource with an empty resource processor. + return returnResourceImpl( + Handle, [](ResourceHandleTy) { return Plugin::success(); }); + } + +protected: + /// Get multiple resources from the pool or create new ones. If the function + /// succeeds, the handles to the resources are saved in \p Handles. Also + /// process each of the obtained resources with \p Processor. + template <typename FuncTy> + Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles, + FuncTy Processor) { + const std::lock_guard<std::mutex> Lock(Mutex); + + assert(NextAvailable <= ResourcePool.size() && + "Resource pool is corrupted"); + + if (NextAvailable + Num > ResourcePool.size()) + // Double the resource pool or resize it to provide the requested ones. + if (auto Err = ResourcePoolTy::resizeResourcePool( + std::max(NextAvailable * 2, NextAvailable + Num))) + return Err; + + // Save the handles in the output array parameter. + for (uint32_t r = 0; r < Num; ++r) + Handles[r] = ResourcePool[NextAvailable + r]; + + // Process all obtained resources. + for (uint32_t r = 0; r < Num; ++r) + if (auto Err = Processor(Handles[r])) + return Err; + + NextAvailable += Num; + + return Plugin::success(); + } + + /// Return resource to the pool and process the resource with \p Processor. 
+ template <typename FuncTy> + Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) { + const std::lock_guard<std::mutex> Lock(Mutex); + + // Process the returned resource. + if (auto Err = Processor(Handle)) + return Err; + + assert(NextAvailable > 0 && "Resource pool is corrupted"); + ResourcePool[--NextAvailable] = Handle; + + return Plugin::success(); + } + +protected: + /// The resources between \p OldSize and \p NewSize need to be created or + /// destroyed. The mutex is locked when this function is called. + Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { + assert(OldSize != NewSize && "Resizing to the same size"); + + if (auto Err = Device.setContext()) + return Err; + + if (OldSize < NewSize) { + // Create new resources. + for (uint32_t I = OldSize; I < NewSize; ++I) { + if (auto Err = ResourcePool[I].create(Device)) + return Err; + } + } else { + // Destroy the obsolete resources. + for (uint32_t I = NewSize; I < OldSize; ++I) { + if (auto Err = ResourcePool[I].destroy(Device)) + return Err; + } + } + return Plugin::success(); + } + + /// Increase or decrease the number of resources. This function should + /// be called with the mutex acquired. + Error resizeResourcePool(uint32_t NewSize) { + uint32_t OldSize = ResourcePool.size(); + + // Nothing to do. + if (OldSize == NewSize) + return Plugin::success(); + + if (OldSize < NewSize) { + // Increase the number of resources. + ResourcePool.resize(NewSize); + return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + } + + // Decrease the number of resources otherwise. + auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + ResourcePool.resize(NewSize); + + return Err; + } + + /// The device to which the resources belong + GenericDeviceTy &Device; + + /// Mutex for the resource pool. + std::mutex Mutex; + + /// The next available resource in the pool. + uint32_t NextAvailable; + + /// The actual resource pool. 
+ std::deque<ResourceRef> ResourcePool; +}; + +/// A static check on whether or not we support RPC in libomptarget. +bool libomptargetSupportsRPC(); + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h new file mode 100644 index 000000000000..01bf539bcb3f --- /dev/null +++ b/offload/plugins-nextgen/common/include/RPC.h @@ -0,0 +1,69 @@ +//===- RPC.h - Interface for remote procedure calls from the GPU ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface to support remote procedure calls (RPC) from +// the GPU. This is required to implement host services like printf or malloc. +// The interface to the RPC server is provided by the 'libc' project in LLVM. +// For more information visit https://libc.llvm.org/gpu/. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Error.h" + +#include <cstdint> + +namespace llvm::omp::target { +namespace plugin { +struct GenericPluginTy; +struct GenericDeviceTy; +class GenericGlobalHandlerTy; +class DeviceImageTy; +} // namespace plugin + +/// A generic class implementing the interface between the RPC server provided +/// by the 'libc' project and 'libomptarget'. If the RPC server is not availible +/// these routines will perform no action. 
+struct RPCServerTy { +public: + /// Initializes the handles to the number of devices we may need to service. + RPCServerTy(plugin::GenericPluginTy &Plugin); + + /// Check if this device image is using an RPC server. This checks for the + /// presence of an externally visible symbol in the device image that will + /// be present whenever RPC code is called. + llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Initialize the RPC server for the given device. This will allocate host + /// memory for the internal server and copy the data to the client on the + /// device. The device must be loaded before this is valid. + llvm::Error initDevice(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Runs the RPC server associated with the \p Device until the pending work + /// is cleared. + llvm::Error runServer(plugin::GenericDeviceTy &Device); + + /// Deinitialize the RPC server for the given device. This will free the + /// memory associated with the server for that device. + llvm::Error deinitDevice(plugin::GenericDeviceTy &Device); + +private: + /// Array from this device's identifier to its attached devices. + llvm::SmallVector<uintptr_t> Handles; +}; + +} // namespace llvm::omp::target + +#endif diff --git a/offload/plugins-nextgen/common/include/Utils/ELF.h b/offload/plugins-nextgen/common/include/Utils/ELF.h new file mode 100644 index 000000000000..88c83d39b68c --- /dev/null +++ b/offload/plugins-nextgen/common/include/Utils/ELF.h @@ -0,0 +1,44 @@ +//===-- Utils/ELF.h - Common ELF functionality ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Common ELF functionality for target plugins. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H + +#include "Shared/PluginAPI.h" + +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" + +namespace utils { +namespace elf { + +/// Returns true or false if the \p Buffer is an ELF file. +bool isELF(llvm::StringRef Buffer); + +/// Checks if the given \p Object is a valid ELF matching the e_machine value. +llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine); + +/// Returns a pointer to the given \p Symbol inside of an ELF object. +llvm::Expected<const void *> +getSymbolAddress(const llvm::object::ELFSymbolRef &Symbol); + +/// Returns the symbol associated with the \p Name in the \p ELFObj. It will +/// first search for the hash sections to identify symbols from the hash table. +/// If that fails it will fall back to a linear search in the case of an +/// executable file without a hash table. +llvm::Expected<std::optional<llvm::object::ELFSymbolRef>> +getSymbol(const llvm::object::ObjectFile &ELFObj, llvm::StringRef Name); + +} // namespace elf +} // namespace utils + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H |
