summaryrefslogtreecommitdiff
path: root/offload/plugins-nextgen/common/include
diff options
context:
space:
mode:
Diffstat (limited to 'offload/plugins-nextgen/common/include')
-rw-r--r--offload/plugins-nextgen/common/include/DLWrap.h286
-rw-r--r--offload/plugins-nextgen/common/include/GlobalHandler.h174
-rw-r--r--offload/plugins-nextgen/common/include/JIT.h125
-rw-r--r--offload/plugins-nextgen/common/include/MemoryManager.h347
-rw-r--r--offload/plugins-nextgen/common/include/PluginInterface.h1537
-rw-r--r--offload/plugins-nextgen/common/include/RPC.h69
-rw-r--r--offload/plugins-nextgen/common/include/Utils/ELF.h44
7 files changed, 2582 insertions, 0 deletions
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h
new file mode 100644
index 000000000000..8934e7e70102
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/DLWrap.h
@@ -0,0 +1,286 @@
+//===-- Shared/DLWrap.h - Convenience wrapper for dlopen/dlsym --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The openmp plugins depend on external libraries. These can be used via:
+// - bitcode file statically linked
+// - (relocatable) object file statically linked
+// - static library
+// - dynamic library, linked at build time
+// - dynamic library, loaded at application run time by dlopen
+//
+// This file factors out most boilerplate for using a dlopened library.
+// - Function symbols are generated that are statically linked against
+// - The dlopen can be done implicitly when initializing the library
+// - dlsym lookups are done once and cached
+// - The abstraction is very thin to permit varied uses of the library
+//
+// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
+// int foo(char x0, double x1, void* x2) {
+// constexpr size_t index = id();
+// void * dlsymResult = pointer(index);
+// return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
+// }
+//
+// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
+// initialization code that can use the thin abstraction:
+// namespace dlwrap {
+// static size_t size();
+// static const char *symbol(size_t);
+// static void **pointer(size_t);
+// }
+// will compile to an object file that only exposes the symbols that the
+// dynamic library would do, with the right function types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_DLWRAP_H
+#define OMPTARGET_SHARED_DLWRAP_H
+
+#include <array>
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+
+// Where symbol is a function, these expand to some book keeping and an
+// implementation of that function
+#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
+#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
+
+// For example, given a prototype:
+// int foo(char, double);
+//
+// DLWRAP(foo, 2) expands to:
+//
+// namespace dlwrap {
+// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
+// using T = dlwrap::trait<decltype(&foo)>;
+// static T::FunctionType get() {
+// constexpr size_t Index = getIndex();
+// void *P = *dlwrap::pointer(Index);
+// return reinterpret_cast<T::FunctionType>(P);
+// }
+// };
+// }
+// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
+//
+// DLWRAP_INTERNAL is similar, except the function it expands to is:
+// static int dlwrap_foo(char x0, double x1) { ... }
+// so that the function pointer call can be wrapped in library-specific code
+//
+// DLWRAP_INITIALIZE() declares static functions:
+#define DLWRAP_INITIALIZE() \
+ namespace dlwrap { \
+ static size_t size(); \
+ static const char *symbol(size_t); /* get symbol name in [0, size()) */ \
+ static void ** \
+ pointer(size_t); /* get pointer to function pointer in [0, size()) */ \
+ }
+
+// DLWRAP_FINALIZE() implements the functions from DLWRAP_INITIALIZE
+#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
+
+// Implementation details follow.
+
+namespace dlwrap {
+
+// Decompose a function-pointer type (as produced by `decltype(&symbol)`) into
+// its return type, its parameter types and its parameter count.
+template <typename F> struct trait;
+template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
+  // The complete function-pointer type this specialization matched.
+  using FunctionType = R (*)(Ts...);
+  // Type returned by the function.
+  using ReturnType = R;
+  // Number of parameters the function takes.
+  static constexpr size_t nargs = sizeof...(Ts);
+
+  // arg<i>::type is the type of the i-th (zero-based) parameter.
+  template <size_t i> struct arg {
+    using type = std::tuple_element_t<i, std::tuple<Ts...>>;
+  };
+};
+
+namespace type {
+// Book keeping is by type specialization
+
+// count<S>::N is the number of DLWRAP_INC() uses on source lines up to and
+// including line S. The primary template just forwards to count<S - 1>, so the
+// value only changes on lines where DLWRAP_INC() provides a specialization.
+template <size_t S> struct count {
+ static constexpr size_t N = count<S - 1>::N;
+};
+
+// Recursion base case: nothing has been counted before the first line.
+template <> struct count<0> { static constexpr size_t N = 0; };
+
+// Get a constexpr size_t ID, starts at zero
+#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
+
+// Increment value returned by DLWRAP_ID
+// This specializes count<> for the current source line, so it can appear at
+// most once per line; a second use on the same line would define the same
+// specialization twice.
+#define DLWRAP_INC() \
+ template <> struct dlwrap::type::count<__LINE__> { \
+ static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \
+ }
+
+// symbol<N>::call() returns the stringified name of the symbol registered
+// under ID N. Only the specializations emitted by DLWRAP_SYMBOL are defined.
+template <size_t N> struct symbol;
+#define DLWRAP_SYMBOL(SYMBOL, ID) \
+ template <> struct dlwrap::type::symbol<ID> { \
+ static constexpr const char *call() { return #SYMBOL; } \
+ }
+} // namespace type
+
+// Build an array of N symbol names where element I is
+// dlwrap::type::symbol<I>::call(). The indices are taken from the
+// std::index_sequence argument; DLWRAP_FINALIZE_IMPL passes
+// std::make_index_sequence<N>.
+// NOTE(review): std::index_sequence is declared in <utility>, which this
+// header does not include directly - presumably it arrives via <array> or
+// <tuple>; confirm.
+template <size_t N, size_t... Is>
+constexpr std::array<const char *, N> static getSymbolArray(
+ std::index_sequence<Is...>) {
+ return {{dlwrap::type::symbol<Is>::call()...}};
+}
+
+// Compile-time check that the arity passed to DLWRAP (Requested) matches the
+// arity of the wrapped function (Required). Both values are template
+// parameters, so they appear in the diagnostic for the failed instantiation.
+template <size_t Requested, size_t Required> constexpr void verboseAssert() {
+ static_assert(Requested == Required, "Arity Error");
+}
+
+} // namespace dlwrap
+
+#define DLWRAP_INSTANTIATE(SYM_DEF, SYM_USE, ARITY) \
+ DLWRAP_INSTANTIATE_##ARITY(SYM_DEF, SYM_USE, \
+ dlwrap::trait<decltype(&SYM_USE)>)
+
+// Defines the three functions declared by DLWRAP_INITIALIZE():
+// - dlwrap::size() returns DLWRAP_ID() as evaluated at the expansion site,
+// i.e. the number of DLWRAP_INC() uses on the preceding lines.
+// - dlwrap::symbol(i) returns the name registered under ID i.
+// - dlwrap::pointer(i) returns the address of slot i in a function-local
+// static array of void*, which the wrappers read their function pointer
+// from.
+// Neither symbol(i) nor pointer(i) range-checks i.
+#define DLWRAP_FINALIZE_IMPL() \
+ static size_t dlwrap::size() { return DLWRAP_ID(); } \
+ static const char *dlwrap::symbol(size_t i) { \
+ static constexpr const std::array<const char *, DLWRAP_ID()> \
+ dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \
+ std::make_index_sequence<DLWRAP_ID()>()); \
+ return dlwrap_symbols[i]; \
+ } \
+ static void **dlwrap::pointer(size_t i) { \
+ static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \
+ return &dlwrap_pointers.data()[i]; \
+ }
+
+// Book keeping shared by DLWRAP and DLWRAP_INTERNAL:
+// 1. bump the ID counter for this source line,
+// 2. register SYMBOL's name under the ID it was just assigned
+// (DLWRAP_ID() - 1, because the counter has already been incremented),
+// 3. define dlwrap::SYMBOL##_Trait, whose get() checks ARITY against the
+// function's real parameter count and returns the pointer stored in
+// dlwrap::pointer(ID), cast to the function's type.
+// get() does not check the stored pointer for null.
+#define DLWRAP_COMMON(SYMBOL, ARITY) \
+ DLWRAP_INC(); \
+ DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \
+ namespace dlwrap { \
+ struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> { \
+ using T = dlwrap::trait<decltype(&SYMBOL)>; \
+ static T::FunctionType get() { \
+ verboseAssert<ARITY, trait<decltype(&SYMBOL)>::nargs>(); \
+ constexpr size_t Index = DLWRAP_ID() - 1; \
+ void *P = *dlwrap::pointer(Index); \
+ return reinterpret_cast<T::FunctionType>(P); \
+ } \
+ }; \
+ }
+
+#define DLWRAP_IMPL(SYMBOL, ARITY) \
+ DLWRAP_COMMON(SYMBOL, ARITY) \
+ DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
+
+#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \
+ DLWRAP_COMMON(SYMBOL, ARITY) \
+ static DLWRAP_INSTANTIATE(dlwrap_##SYMBOL, SYMBOL, ARITY)
+
+// DLWRAP_INSTANTIATE_<N>(SYM_DEF, SYM_USE, T) defines a function named SYM_DEF
+// that takes N parameters and forwards them to the function pointer returned
+// by dlwrap::SYM_USE##_Trait::get(). The return type and the parameter types
+// are read from T, which DLWRAP_INSTANTIATE passes as
+// dlwrap::trait<decltype(&SYM_USE)>. One variant exists per arity from 0 to
+// 11; this is the nullary one.
+#define DLWRAP_INSTANTIATE_0(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
+#define DLWRAP_INSTANTIATE_1(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0); \
+ }
+#define DLWRAP_INSTANTIATE_2(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1); \
+ }
+#define DLWRAP_INSTANTIATE_3(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \
+ }
+#define DLWRAP_INSTANTIATE_4(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \
+ }
+#define DLWRAP_INSTANTIATE_5(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \
+ }
+#define DLWRAP_INSTANTIATE_6(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \
+ }
+
+#define DLWRAP_INSTANTIATE_7(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \
+ }
+
+#define DLWRAP_INSTANTIATE_8(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \
+ }
+#define DLWRAP_INSTANTIATE_9(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
+ }
+#define DLWRAP_INSTANTIATE_10(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9); \
+ }
+#define DLWRAP_INSTANTIATE_11(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9, \
+ typename T::template arg<10>::type x10) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10); \
+ }
+
+#endif // OMPTARGET_SHARED_DLWRAP_H
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
new file mode 100644
index 000000000000..829b4b729119
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -0,0 +1,174 @@
+//===- GlobalHandler.h - Target independent global & environment handling -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target independent global handler and environment manager.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
+
+#include <string>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#include "Shared/Debug.h"
+#include "Shared/Utils.h"
+
+#include "omptarget.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+class DeviceImageTy;
+struct GenericDeviceTy;
+
+using namespace llvm::object;
+
+/// Common abstraction for globals that live on the host and device.
+/// It simply encapsulates the symbol name, symbol size, and symbol address
+/// (which might be host or device depending on the context).
+class GlobalTy {
+  // NOTE: Maybe we can have a pointer to the offload entry name instead of
+  // holding a private copy of the name as a std::string.
+  std::string Name;
+  uint32_t Size;
+  void *Ptr;
+
+public:
+  /// Create a global called \p Name that is \p Size bytes large and located at
+  /// \p Ptr. The address may be left null and filled in later via setPtr().
+  GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
+      : Name(Name), Size(Size), Ptr(Ptr) {}
+
+  /// Symbol name of the global.
+  const std::string &getName() const { return Name; }
+  /// Size of the global in bytes.
+  uint32_t getSize() const { return Size; }
+  /// Address of the global; null if it has not been set.
+  void *getPtr() const { return Ptr; }
+
+  /// Update the size. The parameter is unsigned so that it agrees with the
+  /// stored field, the constructor and getSize(); sizes of 2 GiB and above are
+  /// therefore not routed through a signed type.
+  void setSize(uint32_t S) { Size = S; }
+  /// Update the address.
+  void setPtr(void *P) { Ptr = P; }
+};
+
+/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
+///
+/// The base-class pointer is initialized to the address of the embedded
+/// \p Data member. A member-wise copy would therefore leave the copy pointing
+/// at the *source* object's storage, so the copy operations are spelled out
+/// and re-target the pointer at the copy's own \p Data.
+template <typename Ty> class StaticGlobalTy : public GlobalTy {
+  Ty Data;
+
+  /// Address the copy of \p Other should expose: our own storage if \p Other
+  /// pointed at its own storage, otherwise whatever \p Other was explicitly
+  /// pointed at via setPtr().
+  void *reboundPtr(const StaticGlobalTy &Other) {
+    return Other.getPtr() == &Other.Data ? static_cast<void *>(&Data)
+                                         : Other.getPtr();
+  }
+
+public:
+  /// Construct the global \p Name; \p args are forwarded to the brace
+  /// initializer of the stored value.
+  template <typename... Args>
+  StaticGlobalTy(const std::string &Name, Args &&...args)
+      : GlobalTy(Name, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  /// Same as above for a C-string name.
+  template <typename... Args>
+  StaticGlobalTy(const char *Name, Args &&...args)
+      : GlobalTy(Name, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  /// Construct the global whose name is \p Name immediately followed by
+  /// \p Suffix.
+  template <typename... Args>
+  StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args)
+      : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  /// Copy name, size and value, keeping the exposed pointer on our own
+  /// storage (see class comment).
+  StaticGlobalTy(const StaticGlobalTy &Other)
+      : GlobalTy(Other.getName(), Other.getSize(), reboundPtr(Other)),
+        Data(Other.Data) {}
+
+  /// Copy-assign name, size and value, keeping the exposed pointer on our own
+  /// storage (see class comment).
+  StaticGlobalTy &operator=(const StaticGlobalTy &Other) {
+    if (this != &Other) {
+      GlobalTy::operator=(Other);
+      setPtr(reboundPtr(Other));
+      Data = Other.Data;
+    }
+    return *this;
+  }
+
+  Ty &getValue() { return Data; }
+  const Ty &getValue() const { return Data; }
+  void setValue(const Ty &V) { Data = V; }
+};
+
+/// Helper class to do the heavy lifting when it comes to moving globals between
+/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD,
+/// which means the only thing specialized by the subclass is the retrieval of
+/// global metadata (size, addr) from the device.
+/// \see getGlobalMetadataFromDevice
+class GenericGlobalHandlerTy {
+ /// Actually move memory between host and device. See readGlobalFromDevice and
+ /// writeGlobalToDevice for the interface description.
+ Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ const GlobalTy &HostGlobal,
+ bool Device2Host);
+
+ /// Actually move memory between host and device. See readGlobalFromDevice and
+ /// writeGlobalToDevice for the interface description.
+ Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
+ const GlobalTy &HostGlobal,
+ const GlobalTy &DeviceGlobal,
+ bool Device2Host);
+
+public:
+ virtual ~GenericGlobalHandlerTy() {}
+
+ /// Helper function for getting an ELF from a device image.
+ Expected<std::unique_ptr<ObjectFile>> getELFObjectFile(DeviceImageTy &Image);
+
+ /// Returns whether the symbol named \p SymName is present in the given \p
+ /// Image.
+ bool isSymbolInImage(GenericDeviceTy &Device, DeviceImageTy &Image,
+ StringRef SymName);
+
+ /// Get the address and size of a global in the image. Address and size are
+ /// returned in \p ImageGlobal, the global name is passed in \p ImageGlobal.
+ Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
+ DeviceImageTy &Image, GlobalTy &ImageGlobal);
+
+ /// Read the memory associated with a global from the image and store it on
+ /// the host. The name, size, and destination are defined by \p HostGlobal.
+ Error readGlobalFromImage(GenericDeviceTy &Device, DeviceImageTy &Image,
+ const GlobalTy &HostGlobal);
+
+ /// Get the address and size of a global from the device. Address is returned
+ /// in \p DeviceGlobal, the global name and expected size are passed in
+ /// \p DeviceGlobal.
+ virtual Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ GlobalTy &DeviceGlobal) = 0;
+
+ /// Copy the memory associated with a global from the device to its
+ /// counterpart on the host. The name, size, and destination are defined by
+ /// \p HostGlobal. The origin is defined by \p DeviceGlobal.
+ Error readGlobalFromDevice(GenericDeviceTy &Device,
+ const GlobalTy &HostGlobal,
+ const GlobalTy &DeviceGlobal) {
+ return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
+ /*Device2Host=*/true);
+ }
+
+ /// Copy the memory associated with a global from the device to its
+ /// counterpart on the host. The name, size, and destination are defined by
+ /// \p HostGlobal. The origin is automatically resolved.
+ Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+ const GlobalTy &HostGlobal) {
+ return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+ /*Device2Host=*/true);
+ }
+
+ /// Copy the memory associated with a global from the host to its counterpart
+ /// on the device. The name, size, and origin are defined by \p HostGlobal.
+ /// The destination is defined by \p DeviceGlobal.
+ Error writeGlobalToDevice(GenericDeviceTy &Device, const GlobalTy &HostGlobal,
+ const GlobalTy &DeviceGlobal) {
+ return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
+ /*Device2Host=*/false);
+ }
+
+ /// Copy the memory associated with a global from the host to its counterpart
+ /// on the device. The name, size, and origin are defined by \p HostGlobal.
+ /// The destination is automatically resolved.
+ Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+ const GlobalTy &HostGlobal) {
+ return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+ /*Device2Host=*/false);
+ }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
new file mode 100644
index 000000000000..b22197b89208
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -0,0 +1,125 @@
+//===- JIT.h - Target independent JIT infrastructure ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
+
+#include "Shared/EnvironmentVar.h"
+#include "Shared/Utils.h"
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+
+struct __tgt_device_image;
+
+namespace llvm {
+class MemoryBuffer;
+
+namespace omp {
+namespace target {
+namespace plugin {
+struct GenericDeviceTy;
+} // namespace plugin
+
+/// The JIT infrastructure and caching mechanism.
+struct JITEngine {
+ /// Function type for a callback that will be called after the backend is
+ /// called. It receives the buffer produced by the backend and returns the
+ /// buffer to use in its place, or an error.
+ using PostProcessingFn =
+ std::function<Expected<std::unique_ptr<MemoryBuffer>>(
+ std::unique_ptr<MemoryBuffer>)>;
+
+ JITEngine(Triple::ArchType TA);
+
+ /// Run jit compilation if \p Image is a bitcode image, otherwise simply
+ /// return \p Image. The returned image is expected to be one that could be
+ /// loaded to the device directly.
+ Expected<const __tgt_device_image *>
+ process(const __tgt_device_image &Image,
+ target::plugin::GenericDeviceTy &Device);
+
+ /// Return true if \p Buffer is a bitcode image that can be JITed for the
+ /// given architecture.
+ Expected<bool> checkBitcodeImage(StringRef Buffer) const;
+
+private:
+ /// Compile the bitcode image \p Image and generate the binary image that can
+ /// be loaded to the target device identified by \p ComputeUnitKind.
+ /// \p PostProcessing will be called after codegen to handle cases such as
+ /// an assembler that is run as an external tool.
+ Expected<const __tgt_device_image *>
+ compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind,
+ PostProcessingFn PostProcessing);
+
+ /// Create or retrieve the object image file from the file system or via
+ /// compilation of the \p Image.
+ Expected<std::unique_ptr<MemoryBuffer>>
+ getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
+ const std::string &ComputeUnitKind);
+
+ /// Run backend, which contains optimization and code generation.
+ Expected<std::unique_ptr<MemoryBuffer>>
+ backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel);
+
+ /// Run optimization pipeline.
+ void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
+ unsigned OptLevel);
+
+ /// Run code generation.
+ void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
+ raw_pwrite_stream &OS);
+
+ /// The target triple used by the JIT.
+ const Triple TT;
+
+ struct ComputeUnitInfo {
+ /// LLVM Context in which the modules will be constructed.
+ LLVMContext Context;
+
+ /// Output images generated from LLVM backend.
+ SmallVector<std::unique_ptr<MemoryBuffer>, 4> JITImages;
+
+ /// A map of embedded IR images to JITed images.
+ DenseMap<const __tgt_device_image *, __tgt_device_image *> TgtImageMap;
+ };
+
+ /// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute
+ /// units as they are not CPUs, to the image information we cached for them.
+ StringMap<ComputeUnitInfo> ComputeUnitMap;
+ /// Mutex stored next to \p ComputeUnitMap; presumably guards it - confirm in
+ /// the implementation file.
+ /// NOTE(review): std::mutex is declared in <mutex>, which this header does
+ /// not include directly; it appears to rely on a transitive include.
+ std::mutex ComputeUnitMapMutex;
+
+ /// Control environment variables.
+ StringEnvar ReplacementObjectFileName =
+ StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_OBJECT");
+ StringEnvar ReplacementModuleFileName =
+ StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_MODULE");
+ StringEnvar PreOptIRModuleFileName =
+ StringEnvar("LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE");
+ StringEnvar PostOptIRModuleFileName =
+ StringEnvar("LIBOMPTARGET_JIT_POST_OPT_IR_MODULE");
+ UInt32Envar JITOptLevel = UInt32Envar("LIBOMPTARGET_JIT_OPT_LEVEL", 3);
+ BoolEnvar JITSkipOpt = BoolEnvar("LIBOMPTARGET_JIT_SKIP_OPT", false);
+};
+
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h
new file mode 100644
index 000000000000..fe1989930b76
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/MemoryManager.h
@@ -0,0 +1,347 @@
+//===----------- MemoryManager.h - Target independent memory manager ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target independent memory manager.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
+
+#include <cassert>
+#include <functional>
+#include <list>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "Shared/Debug.h"
+#include "Shared/Utils.h"
+#include "omptarget.h"
+
+/// Base class of per-device allocator. Implementations provide the raw
+/// allocate/free primitives of one device.
+class DeviceAllocatorTy {
+public:
+ virtual ~DeviceAllocatorTy() = default;
+
+ /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
+ /// allocation. \p Kind selects the kind of target memory requested.
+ /// NOTE(review): MemoryManagerTy, the user of this interface in this header,
+ /// treats a null return as allocation failure - confirm that implementations
+ /// follow that convention.
+ virtual void *allocate(size_t Size, void *HstPtr,
+ TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+
+ /// Delete the pointer \p TgtPtr on the device
+ virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+};
+
+/// Class of memory manager. The memory manager is per-device by using
+/// per-device allocator. Therefore, each plugin using memory manager should
+/// have an allocator for each device.
+class MemoryManagerTy {
+ static constexpr const size_t BucketSize[] = {
+ 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7,
+ 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13};
+
+ static constexpr const int NumBuckets =
+ sizeof(BucketSize) / sizeof(BucketSize[0]);
+
+ /// Round \p Num down to a power of two: the result is the largest power of
+ /// two that is less than or equal to \p Num (a power of two maps to itself).
+ /// Zero maps to zero.
+ static size_t floorToPowerOfTwo(size_t Num) {
+   // Copy the most significant set bit into every lower bit position, which
+   // turns Num into 2^k - 1 where bit k-1 was its highest set bit.
+   Num |= Num >> 1;
+   Num |= Num >> 2;
+   Num |= Num >> 4;
+   Num |= Num >> 8;
+   Num |= Num >> 16;
+#if INTPTR_MAX == INT64_MAX
+   Num |= Num >> 32;
+#elif INTPTR_MAX == INT32_MAX
+   // Do nothing with 32-bit
+#else
+#error Unsupported architecture
+#endif
+   // Dropping the lower half of the mask leaves exactly bit k-1. This gives
+   // the same value as (Num + 1) >> 1 but cannot wrap around to zero when the
+   // top bit of the input is set.
+   return Num - (Num >> 1);
+ }
+
+ /// Find a suitable bucket: the index of the largest entry of \p BucketSize
+ /// that does not exceed \p Size rounded down to a power of two.
+ static int findBucket(size_t Size) {
+   const size_t F = floorToPowerOfTwo(Size);
+
+   DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
+
+   // BucketSize is sorted ascending and BucketSize[0] == 0, so index 0 always
+   // qualifies. Invariant: BucketSize[L] <= F, and every index above H is too
+   // large. Taking the upper midpoint guarantees progress when L = M, and the
+   // loop only stops once L == H, so a bucket whose size equals F is always
+   // found (the previous "H - L > 1" exit skipped some exact matches).
+   int L = 0, H = NumBuckets - 1;
+   while (L < H) {
+     const int M = L + (H - L + 1) / 2;
+     if (BucketSize[M] <= F)
+       L = M;
+     else
+       H = M - 1;
+   }
+
+   assert(L >= 0 && L < NumBuckets && "L is out of range");
+
+   DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
+
+   return L;
+ }
+
+ /// A structure stores the meta data of a target pointer
+ struct NodeTy {
+ /// Memory size
+ const size_t Size;
+ /// Target pointer
+ void *Ptr;
+
+ /// Constructor
+ NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
+ };
+
+ /// Orders \p NodeTy entries by size when they're put into \p std::multiset.
+ struct NodeCmpTy {
+ bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
+ return LHS.Size < RHS.Size;
+ }
+ };
+
+ /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
+ /// the look up procedure more efficient.
+ using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
+
+ /// A list of \p FreeListTy entries, each of which is a \p std::multiset of
+ /// Nodes whose size is less or equal to a specific bucket size.
+ std::vector<FreeListTy> FreeLists;
+ /// A list of mutex for each \p FreeListTy entry
+ std::vector<std::mutex> FreeListLocks;
+ /// A table to map from a target pointer to its node
+ std::unordered_map<void *, NodeTy> PtrToNodeTable;
+ /// The mutex for the table \p PtrToNodeTable
+ std::mutex MapTableLock;
+
+ /// The reference to a device allocator
+ DeviceAllocatorTy &DeviceAllocator;
+
+ /// The threshold to manage memory using memory manager. If the request size
+ /// is larger than \p SizeThreshold, the allocation will not be managed by the
+ /// memory manager.
+ size_t SizeThreshold = 1U << 13;
+
+ /// Request memory from target device
+ void *allocateOnDevice(size_t Size, void *HstPtr) const {
+ return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
+ }
+
+ /// Deallocate data on device
+ int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
+
+ /// Fallback used when a device allocation failed, possibly because the
+ /// device is out of memory: hand every cached block in the free lists back to
+ /// the device, drop their entries from the pointer table, then retry the
+ /// allocation once.
+ void *freeAndAllocate(size_t Size, void *HstPtr) {
+   std::vector<void *> ReleasedPtrs;
+
+   // Drain every bucket while holding that bucket's lock.
+   for (int Bucket = 0; Bucket < NumBuckets; ++Bucket) {
+     FreeListTy &Cached = FreeLists[Bucket];
+     std::lock_guard<std::mutex> BucketGuard(FreeListLocks[Bucket]);
+     if (!Cached.empty()) {
+       for (const NodeTy &Node : Cached) {
+         deleteOnDevice(Node.Ptr);
+         ReleasedPtrs.push_back(Node.Ptr);
+       }
+       Cached.clear();
+     }
+   }
+
+   // Forget the nodes whose device memory was just released.
+   if (!ReleasedPtrs.empty()) {
+     std::lock_guard<std::mutex> TableGuard(MapTableLock);
+     for (void *Released : ReleasedPtrs)
+       PtrToNodeTable.erase(Released);
+   }
+
+   // Second attempt, now that the cached memory is back with the device.
+   return allocateOnDevice(Size, HstPtr);
+ }
+
+ /// Allocate \p Size bytes on the device. If the first attempt returns
+ /// \p nullptr (which might mean the device is OOM), release all memory cached
+ /// in the free lists and try once more. Returns \p nullptr if the second
+ /// attempt fails as well.
+ void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
+   void *TgtPtr = allocateOnDevice(Size, HstPtr);
+   if (TgtPtr != nullptr)
+     return TgtPtr;
+
+   // No memory from the device; give back what we cache and retry.
+   DP("Failed to get memory on device. Free all memory in FreeLists and "
+      "try again.\n");
+   TgtPtr = freeAndAllocate(Size, HstPtr);
+
+   if (TgtPtr == nullptr)
+     DP("Still cannot get memory on device probably because the device is "
+        "OOM.\n");
+
+   return TgtPtr;
+ }
+
+public:
+ /// Create a manager on top of \p DeviceAllocator with one free list (and one
+ /// lock) per bucket. A non-zero \p Threshold replaces the default size
+ /// threshold; zero keeps the default.
+ MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0)
+     : FreeLists(NumBuckets), FreeListLocks(NumBuckets),
+       DeviceAllocator(DeviceAllocator) {
+   if (Threshold != 0)
+     SizeThreshold = Threshold;
+ }
+
+ /// Destructor. Releases the device memory of every node recorded in the
+ /// pointer table.
+ ~MemoryManagerTy() {
+   for (auto &Entry : PtrToNodeTable) {
+     assert(Entry.second.Ptr && "nullptr in map table");
+     deleteOnDevice(Entry.second.Ptr);
+   }
+ }
+
+ /// Allocate memory of size \p Size from target device. \p HstPtr is used to
+ /// assist the allocation.
+ ///
+ /// Requests of size zero return \p nullptr without touching the device.
+ /// Requests larger than \p SizeThreshold go straight to the device and are
+ /// not recorded in the pointer table. All other requests first look for a
+ /// cached node in the matching bucket and only fall back to the device if
+ /// none is found. Returns \p nullptr if the device cannot provide memory.
+ void *allocate(size_t Size, void *HstPtr) {
+ // If the size is zero, we will not bother the target device. Just return
+ // nullptr directly.
+ if (Size == 0)
+ return nullptr;
+
+ DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n",
+ Size, DPxPTR(HstPtr));
+
+ // If the size is greater than the threshold, allocate it directly from
+ // device.
+ if (Size > SizeThreshold) {
+ DP("%zu is greater than the threshold %zu. Allocate it directly from "
+ "device\n",
+ Size, SizeThreshold);
+ void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+ DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
+
+ return TgtPtr;
+ }
+
+ NodeTy *NodePtr = nullptr;
+
+ // Try to get a node from FreeList
+ {
+ const int B = findBucket(Size);
+ FreeListTy &List = FreeLists[B];
+
+ // The free list is ordered by NodeTy::Size only, so find() returns a cached
+ // node whose size equals the requested size, if there is one.
+ NodeTy TempNode(Size, nullptr);
+ std::lock_guard<std::mutex> LG(FreeListLocks[B]);
+ const auto Itr = List.find(TempNode);
+
+ if (Itr != List.end()) {
+ NodePtr = &Itr->get();
+ List.erase(Itr);
+ }
+ }
+
+ if (NodePtr != nullptr)
+ DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr));
+
+ // We cannot find a valid node in FreeLists. Let's allocate on device and
+ // create a node for it.
+ if (NodePtr == nullptr) {
+ DP("Cannot find a node in the FreeLists. Allocate on device.\n");
+ // Allocate one on device
+ void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+ if (TgtPtr == nullptr)
+ return nullptr;
+
+ // Create a new node and add it into the map table
+ {
+ std::lock_guard<std::mutex> Guard(MapTableLock);
+ auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr));
+ NodePtr = &Itr.first->second;
+ }
+
+ DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n",
+ DPxPTR(NodePtr), DPxPTR(TgtPtr), Size);
+ }
+
+ assert(NodePtr && "NodePtr should not be nullptr at this point");
+
+ return NodePtr->Ptr;
+ }
+
+ /// Release \p TgtPtr. Pointers that are recorded in the pointer table are not
+ /// returned to the device; their node is put into the free list of the
+ /// matching bucket for reuse. Pointers unknown to the table are freed on the
+ /// device directly and the device's result is returned.
+ int free(void *TgtPtr) {
+   DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
+
+   // Find the node describing this pointer. The entry stays in the table; only
+   // its address is taken.
+   NodeTy *Node = nullptr;
+   {
+     std::lock_guard<std::mutex> TableGuard(MapTableLock);
+     auto Found = PtrToNodeTable.find(TgtPtr);
+     if (Found != PtrToNodeTable.end())
+       Node = &Found->second;
+   }
+
+   // Not one of ours: hand it to the device.
+   if (!Node) {
+     DP("Cannot find its node. Delete it on device directly.\n");
+     return deleteOnDevice(TgtPtr);
+   }
+
+   // Cache the node in the bucket that matches its size.
+   const int Bucket = findBucket(Node->Size);
+
+   DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(Node),
+      Bucket);
+
+   {
+     std::lock_guard<std::mutex> BucketGuard(FreeListLocks[Bucket]);
+     FreeLists[Bucket].insert(*Node);
+   }
+
+   return OFFLOAD_SUCCESS;
+ }
+
+ /// Read the size threshold from the environment variable
+ /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . The result is a <tt>
+ /// std::pair<size_t, bool> </tt>: the first element is the threshold, the
+ /// second is false only when the user explicitly disabled the memory manager
+ /// by setting the variable to 0. If the variable is not set, the result is
+ /// <0, true>.
+ static std::pair<size_t, bool> getSizeThresholdFromEnv() {
+   static UInt32Envar MemoryManagerThreshold(
+       "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD", 0);
+
+   const size_t Threshold = MemoryManagerThreshold.get();
+   const bool ExplicitlyDisabled =
+       MemoryManagerThreshold.isPresent() && Threshold == 0;
+
+   if (!ExplicitlyDisabled)
+     return std::make_pair(Threshold, true);
+
+   DP("Disabled memory manager as user set "
+      "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n");
+   return std::make_pair(0, false);
+ }
+};
+
+// GCC still cannot handle the static data member like Clang so we still need
+// this part.
+constexpr const size_t MemoryManagerTy::BucketSize[];
+constexpr const int MemoryManagerTy::NumBuckets;
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
new file mode 100644
index 000000000000..79e8464bfda5
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -0,0 +1,1537 @@
+//===- PluginInterface.h - Target independent plugin device interface -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <list>
+#include <map>
+#include <shared_mutex>
+#include <vector>
+
+#include "Shared/Debug.h"
+#include "Shared/Environment.h"
+#include "Shared/EnvironmentVar.h"
+#include "Shared/Requirements.h"
+#include "Shared/Utils.h"
+
+#include "GlobalHandler.h"
+#include "JIT.h"
+#include "MemoryManager.h"
+#include "RPC.h"
+#include "omptarget.h"
+
+#ifdef OMPT_SUPPORT
+#include "omp-tools.h"
+#endif
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+
+namespace plugin {
+
+struct GenericPluginTy;
+struct GenericKernelTy;
+struct GenericDeviceTy;
+
+/// Wrapper around __tgt_async_info that simplifies its usage. When the object
+/// is constructed without a valid __tgt_async_info, it falls back to an
+/// internal one and synchronizes the current thread with the pending
+/// operations when AsyncInfoWrapperTy::finalize() is called. The finalize
+/// function must be called before the wrapper object is destroyed.
+struct AsyncInfoWrapperTy {
+  AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
+
+  /// Destroying a wrapper whose async info pointer is still set is an error.
+  ~AsyncInfoWrapperTy() {
+    assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
+  }
+
+  /// Implicit conversion to the raw __tgt_async_info pointer.
+  operator __tgt_async_info *() const { return AsyncInfoPtr; }
+
+  /// Tell whether a queue is already set in the async info.
+  bool hasQueue() const { return AsyncInfoPtr->Queue != nullptr; }
+
+  /// Return the queue cast to \p Ty, which must be as large as the queue field.
+  template <typename Ty> Ty getQueueAs() {
+    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
+                  "Queue is not of the same size as target type");
+    return static_cast<Ty>(AsyncInfoPtr->Queue);
+  }
+
+  /// Store \p Queue in the async info. A queue must not be set already.
+  template <typename Ty> void setQueueAs(Ty Queue) {
+    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
+                  "Queue is not of the same size as target type");
+    assert(AsyncInfoPtr->Queue == nullptr && "Overwriting queue");
+    AsyncInfoPtr->Queue = Queue;
+  }
+
+  /// Synchronize with the pending operations of the __tgt_async_info if it is
+  /// the internal one. The error of the asynchronous operations issued in this
+  /// queue must be passed in \p Err, which is updated with the result of the
+  /// synchronization if it was actually executed. This function must be
+  /// called exactly once, before the object is destroyed.
+  void finalize(Error &Err);
+
+  /// Register \p Ptr as an associated allocation that is freed after
+  /// finalization.
+  void freeAllocationAfterSynchronization(void *Ptr) {
+    AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
+  }
+
+private:
+  GenericDeviceTy &Device;
+  __tgt_async_info LocalAsyncInfo;
+  __tgt_async_info *AsyncInfoPtr;
+};
+
+/// Level of a key-value property in the printed info tree; it determines the
+/// indentation of the property. The first level should be the default.
+enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2 = 2, InfoLevel3 = 3 };
+
+/// Container of device information that is printed later. An object of this
+/// type is a queue of key-value properties; each property has a key, a value
+/// and an optional unit for the value. For printing purposes, the properties
+/// are classified into levels, which are useful for defining sections and
+/// subsections. Therefore, each property carries an additional field with
+/// the level it belongs to, and that level determines the indentation of the
+/// property at printing time. See the enum InfoLevelKind for the list of
+/// accepted levels.
+class InfoQueueTy {
+  struct InfoQueueEntryTy {
+    std::string Key;
+    std::string Value;
+    std::string Units;
+    uint64_t Level;
+  };
+
+  std::deque<InfoQueueEntryTy> Queue;
+
+public:
+  /// Append a new info entry to the queue. A non-empty key string in \p Key is
+  /// mandatory. The value in \p Value is optional and may be of any type that
+  /// can be represented as a string. The units in \p Units are optional and
+  /// must be a string. The info level is a template parameter and defaults to
+  /// the first (top) level.
+  template <InfoLevelKind L = InfoLevel1, typename T = std::string>
+  void add(const std::string &Key, T Value = T(),
+           const std::string &Units = std::string()) {
+    assert(!Key.empty() && "Invalid info key");
+
+    // Store the value as a string, converting it according to its type.
+    if constexpr (std::is_same_v<T, bool>) {
+      Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
+    } else if constexpr (std::is_arithmetic_v<T>) {
+      Queue.push_back({Key, std::to_string(Value), Units, L});
+    } else {
+      Queue.push_back({Key, Value, Units, L});
+    }
+  }
+
+  /// Print all the info entries that were added to the queue.
+  void print() const {
+    // Every level is indented by four spaces.
+    constexpr uint64_t IndentSize = 4;
+
+    // Compute the widest key, including its level indentation, so that the
+    // values of all the entries can be aligned.
+    uint64_t MaxKeySize = 0;
+    for (const InfoQueueEntryTy &Entry : Queue) {
+      const uint64_t KeySize = Entry.Level * IndentSize + Entry.Key.size();
+      MaxKeySize = (KeySize > MaxKeySize) ? KeySize : MaxKeySize;
+    }
+
+    // Print the entries one by one.
+    for (const InfoQueueEntryTy &Entry : Queue) {
+      // Padding before the key and between the key and the value.
+      const uint64_t KeyIndentSize = Entry.Level * IndentSize;
+      const uint64_t ValIndentSize =
+          MaxKeySize - KeyIndentSize - Entry.Key.size() + IndentSize;
+      const char *UnitsSeparator = Entry.Units.empty() ? "" : " ";
+      llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
+                   << std::string(ValIndentSize, ' ') << Entry.Value
+                   << UnitsSeparator << Entry.Units << "\n";
+    }
+  }
+};
+
+/// Wrapper of a __tgt_device_image and its offload entry table on a specific
+/// device. The class is in charge of storing and managing the offload
+/// entries for an image on a device.
+class DeviceImageTy {
+  /// Identifier of the image within the corresponding device. This id is not
+  /// unique across different devices; ids may overlap.
+  int32_t ImageId;
+
+  /// The raw __tgt_device_image pointers; the bitcode one is null until set.
+  const __tgt_device_image *TgtImage;
+  const __tgt_device_image *TgtImageBitcode = nullptr;
+
+  /// The device this image is loaded on.
+  GenericDeviceTy &Device;
+
+  /// Whether this image has any global destructors that must be called.
+  /// FIXME: This is only required because we currently have no invariants
+  /// towards the lifetime of the underlying image. We should either copy
+  /// the image into memory locally or erase the pointers after init.
+  bool PendingGlobalDtors = false;
+
+public:
+  DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
+                const __tgt_device_image *Image)
+      : ImageId(Id), TgtImage(Image), Device(Device) {
+    assert(TgtImage && "Invalid target image");
+  }
+
+  /// Get the identifier of the image within the device.
+  int32_t getId() const { return ImageId; }
+
+  /// Get the device that this image is loaded onto.
+  GenericDeviceTy &getDevice() const { return Device; }
+
+  /// Get the pointer to the raw __tgt_device_image.
+  const __tgt_device_image *getTgtImage() const { return TgtImage; }
+
+  /// Accessors to the raw __tgt_device_image pointer of the bitcode image.
+  void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
+    this->TgtImageBitcode = TgtImageBitcode;
+  }
+  const __tgt_device_image *getTgtImageBitcode() const {
+    return TgtImageBitcode;
+  }
+
+  /// Get the starting address of the image.
+  void *getStart() const { return TgtImage->ImageStart; }
+
+  /// Get the size of the image.
+  size_t getSize() const {
+    return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
+  }
+
+  /// Get a memory buffer reference that covers the whole image.
+  MemoryBufferRef getMemoryBuffer() const {
+    StringRef Contents(static_cast<const char *>(getStart()), getSize());
+    return MemoryBufferRef(Contents, "Image");
+  }
+
+  /// Accessors to the pending global destructors flag. The setter returns true.
+  bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
+  bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
+};
+
+/// Base class with the functionalities that are common to all offload
+/// kernels. Each plugin should define its specific kernel class deriving from
+/// this generic one and implement the necessary virtual function members.
+struct GenericKernelTy {
+  /// Construct a kernel with the name \p Name.
+  GenericKernelTy(const char *Name) : Name(Name) {}
+
+  virtual ~GenericKernelTy() = default;
+
+  /// Initialize the kernel object from a specific device.
+  Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
+  /// Initialization hook to be implemented by the plugin-specific kernel.
+  virtual Error initImpl(GenericDeviceTy &GenericDevice,
+                         DeviceImageTy &Image) = 0;
+
+  /// Launch the kernel on the given device, which must be the same one that
+  /// was used to initialize the kernel.
+  Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
+               AsyncInfoWrapperTy &AsyncInfoWrapper) const;
+  /// Launch hook to be implemented by the plugin-specific kernel.
+  virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
+                           uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+                           void *Args,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
+
+  /// Get the name of the kernel.
+  const char *getName() const { return Name; }
+
+  /// Tell whether this kernel is a constructor or a destructor.
+  bool isCtorOrDtor() const {
+    // TODO: This is not a great solution and should be revisited.
+    return StringRef(Name).ends_with("tor");
+  }
+
+  /// Get the image that contains the kernel. The kernel must be initialized.
+  DeviceImageTy &getImage() const {
+    assert(ImagePtr && "Kernel is not initialized!");
+    return *ImagePtr;
+  }
+
+  /// Return the kernel environment object of this kernel.
+  const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
+    return KernelEnvironment;
+  }
+
+  /// Return a device pointer to a new kernel launch environment.
+  Expected<KernelLaunchEnvironmentTy *>
+  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
+                             AsyncInfoWrapperTy &AsyncInfo) const;
+
+  /// Tell whether \p ExecutionMode is one of the valid execution modes.
+  static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
+    return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD ||
+           ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC ||
+           ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+  }
+
+protected:
+  /// Get the name of the execution mode of the kernel.
+  const char *getExecutionModeName() const {
+    const auto ExecMode = KernelEnvironment.Configuration.ExecMode;
+    if (ExecMode == OMP_TGT_EXEC_MODE_SPMD)
+      return "SPMD";
+    if (ExecMode == OMP_TGT_EXEC_MODE_GENERIC)
+      return "Generic";
+    if (ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD)
+      return "Generic-SPMD";
+    llvm_unreachable("Unknown execution mode!");
+  }
+
+  /// Print the generic kernel launch information.
+  Error printLaunchInfo(GenericDeviceTy &GenericDevice,
+                        KernelArgsTy &KernelArgs, uint32_t NumThreads,
+                        uint64_t NumBlocks) const;
+
+  /// Print the plugin-specific kernel launch information after the generic
+  /// kernel launch information.
+  virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
+                                       KernelArgsTy &KernelArgs,
+                                       uint32_t NumThreads,
+                                       uint64_t NumBlocks) const;
+
+private:
+  /// Prepare the arguments before launching the kernel.
+  void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+                    ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
+                    llvm::SmallVectorImpl<void *> &Args,
+                    llvm::SmallVectorImpl<void *> &Ptrs,
+                    KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
+
+  /// Get the number of threads for the kernel based on the user-defined
+  /// thread limit clause passed in \p ThreadLimitClause.
+  uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
+                         uint32_t ThreadLimitClause[3]) const;
+
+  /// Get the number of blocks for the kernel. The number of threads
+  /// \p NumThreads can be adjusted by this method. \p IsNumThreadsFromUser is
+  /// true if \p NumThreads is defined by user via thread_limit clause.
+  uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
+                        uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
+                        uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
+
+  /// Predicates telling whether the kernel works in Generic-SPMD, Generic or
+  /// SPMD mode, according to the kernel environment.
+  bool isGenericSPMDMode() const {
+    const auto &Config = KernelEnvironment.Configuration;
+    return Config.ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+  }
+
+  bool isGenericMode() const {
+    const auto &Config = KernelEnvironment.Configuration;
+    return Config.ExecMode == OMP_TGT_EXEC_MODE_GENERIC;
+  }
+
+  bool isSPMDMode() const {
+    const auto &Config = KernelEnvironment.Configuration;
+    return Config.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
+  }
+
+  /// The kernel name.
+  const char *Name;
+
+  /// The image that contains this kernel.
+  DeviceImageTy *ImagePtr = nullptr;
+
+protected:
+  /// The preferred number of threads to run the kernel.
+  uint32_t PreferredNumThreads = 0;
+
+  /// The maximum number of threads which the kernel could leverage.
+  uint32_t MaxNumThreads = 0;
+
+  /// The kernel environment, including execution flags.
+  KernelEnvironmentTy KernelEnvironment;
+
+  /// The prototype kernel launch environment.
+  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+  /// If the kernel is a bare kernel.
+  bool IsBareKernel = false;
+};
+
+/// Class representing a map of host pinned allocations. We track these pinned
+/// allocations, so memory transfers involving these buffers can be optimized.
+class PinnedAllocationMapTy {
+
+  /// Struct representing a map entry.
+  struct EntryTy {
+    /// The host pointer of the pinned allocation.
+    void *HstPtr;
+
+    /// The pointer that devices' driver should use to transfer data from/to the
+    /// pinned allocation. In most plugins, this pointer will be the same as the
+    /// host pointer above.
+    void *DevAccessiblePtr;
+
+    /// The size of the pinned allocation.
+    size_t Size;
+
+    /// Indicate whether the allocation was locked from outside the plugin, for
+    /// instance, from the application. The externally locked allocations are
+    /// not unlocked by the plugin when unregistering the last user.
+    bool ExternallyLocked;
+
+    /// The number of references to the pinned allocation, which should remain
+    /// pinned and registered to the map until this number becomes zero. It is
+    /// mutable because std::set only gives const access to its elements.
+    mutable size_t References;
+
+    /// Create an entry with the host and device accessible pointers, the buffer
+    /// size, and a boolean indicating whether the buffer was locked externally.
+    EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
+            bool ExternallyLocked)
+        : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
+          ExternallyLocked(ExternallyLocked), References(1) {}
+
+    /// Utility constructor used for std::set searches.
+    EntryTy(void *HstPtr)
+        : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
+          ExternallyLocked(false), References(0) {}
+  };
+
+  /// Comparator of map entries. Use the host pointer to enforce an order
+  /// between entries.
+  struct EntryCmpTy {
+    bool operator()(const EntryTy &Left, const EntryTy &Right) const {
+      return Left.HstPtr < Right.HstPtr;
+    }
+  };
+
+  using PinnedAllocSetTy = std::set<EntryTy, EntryCmpTy>;
+
+  /// The map of host pinned allocations.
+  PinnedAllocSetTy Allocs;
+
+  /// The mutex to protect accesses to the map.
+  mutable std::shared_mutex Mutex;
+
+  /// Reference to the corresponding device.
+  GenericDeviceTy &Device;
+
+  /// Indicate whether mapped host buffers should be locked automatically.
+  bool LockMappedBuffers;
+
+  /// Indicate whether failures when locking mapped buffers should be ignored.
+  bool IgnoreLockMappedFailures;
+
+  /// Find an allocation that intersects with \p HstPtr pointer. Assume the
+  /// map's mutex is acquired.
+  const EntryTy *findIntersecting(const void *HstPtr) const {
+    if (Allocs.empty())
+      return nullptr;
+
+    // Search the first allocation with starting address that is not less than
+    // the buffer address.
+    auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
+
+    // Direct match of starting addresses.
+    if (It != Allocs.end() && It->HstPtr == HstPtr)
+      return &(*It);
+
+    // Not direct match but may be a previous pinned allocation in the map which
+    // contains the buffer. Return null if there is no such previous
+    // allocation.
+    if (It == Allocs.begin())
+      return nullptr;
+
+    // Move to the previous pinned allocation.
+    --It;
+
+    // The buffer address lies inside the previous pinned allocation.
+    if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
+      return &(*It);
+
+    // None found.
+    return nullptr;
+  }
+
+  /// Insert an entry to the map representing a locked buffer. The number of
+  /// references is set to one.
+  Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
+                    bool ExternallyLocked = false);
+
+  /// Erase an existing entry from the map.
+  Error eraseEntry(const EntryTy &Entry);
+
+  /// Register a new user into an entry that represents a locked buffer. Check
+  /// also that the registered buffer with \p HstPtr address and \p Size is
+  /// actually contained into the entry.
+  Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
+
+  /// Unregister a user from the entry and return whether it is the last user.
+  /// If it is the last user, the entry will have to be removed from the map
+  /// and unlock the entry's host buffer (if necessary).
+  Expected<bool> unregisterEntryUse(const EntryTy &Entry);
+
+  /// Indicate whether the first range A fully contains the second range B.
+  static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
+    void *EndA = advanceVoidPtr(PtrA, SizeA);
+    void *EndB = advanceVoidPtr(PtrB, SizeB);
+    return (PtrB >= PtrA && EndB <= EndA);
+  }
+
+  /// Indicate whether the first range A intersects with the second range B.
+  static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
+    void *EndA = advanceVoidPtr(PtrA, SizeA);
+    void *EndB = advanceVoidPtr(PtrB, SizeB);
+    return (PtrA < EndB && PtrB < EndA);
+  }
+
+public:
+  /// Create the map of pinned allocations corresponding to a specific device.
+  PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
+    // Envar that indicates whether mapped host buffers should be locked
+    // automatically. The possible values are boolean (on/off) and a special:
+    //   off:       Mapped host buffers are not locked.
+    //   on:        Mapped host buffers are locked in a best-effort approach.
+    //              Failures to lock the buffers are silent.
+    //   mandatory: Mapped host buffers are always locked and failures to lock
+    //              a buffer result in a fatal error.
+    StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
+                                       "off");
+
+    bool Enabled;
+    if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
+      // Parsed as a boolean value. Enable the feature if necessary.
+      LockMappedBuffers = Enabled;
+      IgnoreLockMappedFailures = true;
+    } else if (OMPX_LockMappedBuffers.get() == "mandatory") {
+      // Enable the feature and failures are fatal.
+      LockMappedBuffers = true;
+      IgnoreLockMappedFailures = false;
+    } else {
+      // Invalid value: disable the feature and leave no flag uninitialized.
+      DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
+         OMPX_LockMappedBuffers.get().data());
+      LockMappedBuffers = false;
+      IgnoreLockMappedFailures = true;
+    }
+  }
+
+  /// Register a buffer that was recently allocated as a locked host buffer.
+  /// None of the already registered pinned allocations should intersect with
+  /// this new one. The registration requires the host pointer in \p HstPtr,
+  /// the device accessible pointer in \p DevAccessiblePtr, and the size of the
+  /// allocation in \p Size. The allocation must be unregistered using the
+  /// unregisterHostBuffer function.
+  Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
+
+  /// Unregister a host pinned allocation passing the host pointer which was
+  /// previously registered using the registerHostBuffer function. When calling
+  /// this function, the pinned allocation cannot have any other user and will
+  /// not be unlocked by this function.
+  Error unregisterHostBuffer(void *HstPtr);
+
+  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
+  /// with an already existing one. A partial overlapping with extension is not
+  /// allowed. The function returns the device accessible pointer of the pinned
+  /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
+  Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
+
+  /// Unlock the host buffer at \p HstPtr or unregister a user if other users
+  /// are still using the pinned allocation. If this was the last user, the
+  /// pinned allocation is removed from the map and the memory is unlocked.
+  Error unlockHostBuffer(void *HstPtr);
+
+  /// Lock or register a host buffer that was recently mapped by libomptarget.
+  /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
+  /// enabled. Even if not enabled, externally locked buffers are registered
+  /// in order to optimize their transfers.
+  Error lockMappedHostBuffer(void *HstPtr, size_t Size);
+
+  /// Unlock or unregister a host buffer that was unmapped by libomptarget.
+  Error unlockUnmappedHostBuffer(void *HstPtr);
+
+  /// Return the device accessible pointer associated to the host pinned
+  /// allocation which the \p HstPtr belongs, if any. Return null in case the
+  /// \p HstPtr does not belong to any host pinned allocation. The device
+  /// accessible pointer is the one that devices should use for data transfers
+  /// that involve a host pinned buffer.
+  void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
+    std::shared_lock<std::shared_mutex> Lock(Mutex);
+
+    // Find the intersecting allocation if any.
+    const EntryTy *Entry = findIntersecting(HstPtr);
+    if (!Entry)
+      return nullptr;
+
+    return advanceVoidPtr(Entry->DevAccessiblePtr,
+                          getPtrDiff(HstPtr, Entry->HstPtr));
+  }
+
+  /// Check whether a buffer belongs to a registered host pinned allocation.
+  bool isHostPinnedBuffer(const void *HstPtr) const {
+    std::shared_lock<std::shared_mutex> Lock(Mutex);
+
+    // Return whether there is an intersecting allocation.
+    return findIntersecting(HstPtr) != nullptr;
+  }
+};
+
+/// Class implementing common functionalities of offload devices. Each plugin
+/// should define the specific device class, derive from this generic one, and
+/// implement the necessary virtual function members.
+struct GenericDeviceTy : public DeviceAllocatorTy {
+ /// Construct a device with its device id within the plugin, the number of
+ /// devices in the plugin and the grid values for that kind of device.
+ GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+ const llvm::omp::GV &GridValues);
+
+ /// Get the device identifier within the corresponding plugin. Notice that
+ /// this id is not unique between different plugins; they may overlap.
+ int32_t getDeviceId() const { return DeviceId; }
+
+ /// Set the context of the device if needed, before calling device-specific
+ /// functions. Plugins may implement this function as a no-op if not needed.
+ virtual Error setContext() = 0;
+
+ /// Initialize the device. After this call, the device should be already
+ /// working and ready to accept queries or modifications.
+ Error init(GenericPluginTy &Plugin);
+ virtual Error initImpl(GenericPluginTy &Plugin) = 0;
+
+ /// Deinitialize the device and free all its resources. After this call, the
+ /// device is no longer considered ready, so no queries or modifications are
+ /// allowed.
+ Error deinit(GenericPluginTy &Plugin);
+ virtual Error deinitImpl() = 0;
+
+ /// Load the binary image into the device and return the target table.
+ Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
+ const __tgt_device_image *TgtImage);
+ virtual Expected<DeviceImageTy *>
+ loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
+
+ /// Setup the device environment if needed. Notice this setup may not be run
+ /// on some plugins. By default, it will be executed, but plugins can change
+ /// this behavior by overriding the shouldSetupDeviceEnvironment function.
+ Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
+
+ /// Setup the global device memory pool, if the plugin requires one.
+ Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ uint64_t PoolSize);
+
+ // Setup the RPC server for this device if needed. This may not run on some
+ // plugins like the CPU targets. By default, it will not be executed so it is
+ // up to the target to override this using the shouldSetupRPCServer function.
+ Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
+
+ /// Synchronize the current thread with the pending operations on the
+ /// __tgt_async_info structure.
+ Error synchronize(__tgt_async_info *AsyncInfo);
+ virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
+
+ /// Invokes any global constructors on the device if present and is required
+ /// by the target.
+ virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
+ DeviceImageTy &Image) {
+ return Error::success();
+ }
+
+ /// Invokes any global destructors on the device if present and is required
+ /// by the target.
+ virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
+ DeviceImageTy &Image) {
+ return Error::success();
+ }
+
+ /// Query for the completion of the pending operations on the __tgt_async_info
+ /// structure in a non-blocking manner.
+ Error queryAsync(__tgt_async_info *AsyncInfo);
+ virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
+
+ /// Check whether the architecture supports VA management
+ virtual bool supportVAManagement() const { return false; }
+
+ /// Get the total device memory size
+ virtual Error getDeviceMemorySize(uint64_t &DSize);
+
+ /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
+ /// map it to \p VAddr. The obtained address is stored in \p Addr. At return
+ /// \p RSize contains the actual size which can be equal or larger than the
+ /// requested size.
+ virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
+
+ /// De-allocates device memory and unmaps the virtual address \p VAddr
+ virtual Error memoryVAUnMap(void *VAddr, size_t Size);
+
+ /// Allocate data on the device or involving the device.
+ Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
+
+ /// Deallocate data from the device or involving the device.
+ Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
+
+ /// Pin host memory to optimize transfers and return the device accessible
+ /// pointer that devices should use for memory transfers involving the host
+ /// pinned allocation.
+ Expected<void *> dataLock(void *HstPtr, int64_t Size) {
+ return PinnedAllocs.lockHostBuffer(HstPtr, Size);
+ }
+
+ /// Unpin a host memory buffer that was previously pinned.
+ Error dataUnlock(void *HstPtr) {
+ return PinnedAllocs.unlockHostBuffer(HstPtr);
+ }
+
+ /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
+ /// API and return the device accessible pointer.
+ virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
+
+ /// Unlock a previously locked host buffer starting at \p HstPtr.
+ virtual Error dataUnlockImpl(void *HstPtr) = 0;
+
+ /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
+ /// buffer. This means that libomptarget created a new mapping of that host
+ /// buffer (e.g., because of a user OpenMP target map) and the buffer may be
+ /// used as source/destination of memory transfers. We can use this
+ /// information to lock the host buffer and optimize its memory transfers.
+ Error notifyDataMapped(void *HstPtr, int64_t Size) {
+ return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
+ }
+
+ /// Mark the host buffer with address \p HstPtr as unmapped. This means that
+ /// libomptarget removed an existing mapping. If the plugin locked the buffer
+ /// in notifyDataMapped, this function should unlock it.
+ Error notifyDataUnmapped(void *HstPtr) {
+ return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
+ }
+
+ /// Check whether the host buffer with address \p HstPtr is pinned by the
+ /// underlying vendor-specific runtime (if any). Retrieve the host pointer,
+ /// the device accessible pointer and the size of the original pinned buffer.
+ virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
+ void *&BaseDevAccessiblePtr,
+ size_t &BaseSize) const = 0;
+
+ /// Submit data to the device (host to device transfer).
+ Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Retrieve data from the device (device to host transfer).
+ Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Exchange data between devices (device to device transfer). Calling this
+ /// function is only valid if GenericPlugin::isDataExchangable() passing the
+ /// two devices returns true.
+ Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
+ int64_t Size, __tgt_async_info *AsyncInfo);
+ virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+ void *DstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Run the kernel associated with \p EntryPtr
+ Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
+ KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
+
+ /// Initialize a __tgt_async_info structure. Related to interop features.
+ Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
+ virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Initialize a __tgt_device_info structure. Related to interop features.
+ Error initDeviceInfo(__tgt_device_info *DeviceInfo);
+ virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
+
+ /// Create an event.
+ Error createEvent(void **EventPtrStorage);
+ virtual Error createEventImpl(void **EventPtrStorage) = 0;
+
+ /// Destroy an event.
+ Error destroyEvent(void *Event);
+ virtual Error destroyEventImpl(void *EventPtr) = 0;
+
+ /// Start the recording of the event.
+ Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
+ virtual Error recordEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Wait for an event to finish. Notice this wait is asynchronous if the
+ /// __tgt_async_info is not nullptr.
+ Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
+ virtual Error waitEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+ /// Synchronize the current thread with the event.
+ Error syncEvent(void *EventPtr);
+ virtual Error syncEventImpl(void *EventPtr) = 0;
+
+ /// Print information about the device.
+ Error printInfo();
+ virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
+
+ /// Getters for the grid values, dynamic memory size, and clock frequency.
+ uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
+ uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
+ uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
+ uint32_t getDefaultNumThreads() const {
+ return GridValues.GV_Default_WG_Size;
+ }
+ uint32_t getDefaultNumBlocks() const {
+ return GridValues.GV_Default_Num_Teams;
+ }
+ uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
+ virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
+
+ /// Get the target compute unit kind (e.g., sm_80 or gfx908); default "unknown".
+ virtual std::string getComputeUnitKind() const { return "unknown"; }
+
+ /// Post-processing after the JIT backend. The ownership of \p MB is taken.
+ virtual Expected<std::unique_ptr<MemoryBuffer>>
+ doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
+ return std::move(MB);
+ }
+
+ /// The minimum number of threads used for a low-trip-count combined loop.
+ /// Instead of using more threads, we increase the outer (block/team)
+ /// parallelism.
+ /// @see OMPX_MinThreadsForLowTripCount
+ virtual uint32_t getMinThreadsForLowTripCountLoop() {
+ return OMPX_MinThreadsForLowTripCount;
+ }
+
+ /// Get the total amount of hardware parallelism supported by the target
+ /// device. This is the total number of warps or wavefronts that can be
+ /// resident on the device simultaneously. The default implementation is 0.
+ virtual uint64_t getHardwareParallelism() const { return 0; }
+
+ /// Get the pointer to the RPC server attached to this device.
+ RPCServerTy *getRPCServer() const { return RPCServer; }
+
+ /// The number of parallel RPC ports to use on the device. In general, this
+ /// should be roughly equivalent to the amount of hardware parallelism the
+ /// device can support. This is because GPUs in general do not have forward
+ /// progress guarantees, so we minimize thread level dependencies by
+ /// allocating enough space such that each device thread can have a port. This
+ /// is likely overly pessimistic in the average case, but guarantees no
+ /// deadlocks at the cost of memory. This must be overridden by targets
+ /// expecting to use the RPC server.
+ virtual uint64_t requestedRPCPortCount() const {
+ assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
+ return 0;
+ }
+
+ virtual Error getDeviceStackSize(uint64_t &V) = 0;
+
+ /// Returns true if the current plugin architecture is an APU
+ /// and unified_shared_memory was not requested by the program.
+ bool useAutoZeroCopy();
+ virtual bool useAutoZeroCopyImpl() { return false; }
+
+ /// Allocate and construct a kernel object for the kernel named \p Name.
+ virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
+
+ /// Reference to the underlying plugin that created this device.
+ GenericPluginTy &Plugin;
+
+private:
+ /// Get and set the stack size and heap size for the device. If not used, the
+ /// plugin can implement the setters as no-ops and set the output value to
+ /// zero in the getters.
+ virtual Error setDeviceStackSize(uint64_t V) = 0;
+ virtual Error getDeviceHeapSize(uint64_t &V) = 0;
+ virtual Error setDeviceHeapSize(uint64_t V) = 0;
+
+ /// Indicate whether the device should setup the device environment. Notice
+ /// that returning false in this function will change the behavior of the
+ /// setupDeviceEnvironment() function.
+ virtual bool shouldSetupDeviceEnvironment() const { return true; }
+
+ /// Indicate whether the device should setup the global device memory pool. If
+ /// false is returned, the value on the device will be uninitialized.
+ virtual bool shouldSetupDeviceMemoryPool() const { return true; }
+
+ /// Indicate whether or not the device should setup the RPC server. This is
+ /// only necessary for unhosted targets like the GPU.
+ virtual bool shouldSetupRPCServer() const { return false; }
+
+ /// Pointer to the memory manager, or nullptr if not available.
+ MemoryManagerTy *MemoryManager;
+
+ /// Environment variables defined by the OpenMP standard.
+ Int32Envar OMP_TeamLimit;
+ Int32Envar OMP_NumTeams;
+ Int32Envar OMP_TeamsThreadLimit;
+
+ /// Environment variables defined by the LLVM OpenMP implementation.
+ Int32Envar OMPX_DebugKind;
+ UInt32Envar OMPX_SharedMemorySize;
+ UInt64Envar OMPX_TargetStackSize;
+ UInt64Envar OMPX_TargetHeapSize;
+
+ /// Environment flag to set the minimum number of threads we use for a
+ /// low-trip-count combined loop. Instead of using more threads, we increase
+ /// the outer (block/team) parallelism.
+ UInt32Envar OMPX_MinThreadsForLowTripCount =
+ UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
+
+protected:
+ /// Environment variables defined by the LLVM OpenMP implementation
+ /// regarding the initial number of streams and events.
+ UInt32Envar OMPX_InitialNumStreams;
+ UInt32Envar OMPX_InitialNumEvents;
+
+ /// Array of images loaded into the device. Images are automatically
+ /// deallocated by the allocator.
+ llvm::SmallVector<DeviceImageTy *> LoadedImages;
+
+ /// The identifier of the device within the plugin. Notice this is not a
+ /// global device id and is not the device id visible to the OpenMP user.
+ const int32_t DeviceId;
+
+ /// The default grid values used for this device.
+ llvm::omp::GV GridValues;
+
+ /// Enumeration used for representing the current state of the peer access
+ /// between two devices (both under the same plugin). The states can be:
+ /// a) PENDING when the state has not been queried and needs to be queried,
+ /// b) AVAILABLE when the peer access is available to be used, and
+ /// c) UNAVAILABLE if the system does not allow it.
+ enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
+
+ /// Array of peer access states with the rest of devices. This means that if
+ /// the device I has an array PeerAccesses with PeerAccesses[J] == AVAILABLE,
+ /// the device I can access device J's memory directly. However, notice this
+ /// does not mean that device J can access device I's memory directly.
+ llvm::SmallVector<PeerAccessState> PeerAccesses;
+ std::mutex PeerAccessesLock;
+
+ /// Map of host pinned allocations used to optimize device transfers.
+ PinnedAllocationMapTy PinnedAllocs;
+
+ /// A pointer to an RPC server instance attached to this device if present.
+ /// This is used to run the RPC server during task synchronization.
+ RPCServerTy *RPCServer;
+
+#ifdef OMPT_SUPPORT
+ /// OMPT callback functions
+#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
+ FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
+#undef defineOmptCallback
+
+ /// Internal representation for OMPT device (initialize & finalize)
+ std::atomic<bool> OmptInitialized;
+#endif
+
+private:
+ DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
+ DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+};
+
+/// Class implementing common functionalities of offload plugins. Each plugin
+/// should define the specific plugin class, derive from this generic one, and
+/// implement the necessary virtual function members.
+struct GenericPluginTy {
+
+ /// Construct a plugin instance.
+ GenericPluginTy(Triple::ArchType TA)
+ : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), JIT(TA),
+ RPCServer(nullptr) {}
+
+ virtual ~GenericPluginTy() {}
+
+ /// Initialize the plugin.
+ Error init();
+
+ /// Initialize the plugin and return the number of available devices.
+ virtual Expected<int32_t> initImpl() = 0;
+
+ /// Deinitialize the plugin and release the resources.
+ Error deinit();
+ virtual Error deinitImpl() = 0;
+
+ /// Create a new device for the underlying plugin.
+ virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
+ int32_t DeviceID,
+ int32_t NumDevices) = 0;
+
+ /// Create a new global handler for the underlying plugin.
+ virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
+
+ /// Get the reference to the device with a certain device id.
+ GenericDeviceTy &getDevice(int32_t DeviceId) {
+ assert(isValidDeviceId(DeviceId) && "Invalid device id");
+ assert(Devices[DeviceId] && "Device is unitialized");
+
+ return *Devices[DeviceId];
+ }
+
+ /// Get the number of active devices.
+ int32_t getNumDevices() const { return NumDevices; }
+
+ /// Get the plugin-specific device identifier offset.
+ int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; }
+
+ /// Set the plugin-specific device identifier offset.
+ void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; }
+
+ /// Get the ELF code to recognize the binary image of this plugin.
+ virtual uint16_t getMagicElfBits() const = 0;
+
+ /// Get the target triple of this plugin.
+ virtual Triple::ArchType getTripleArch() const = 0;
+
+ /// Allocate a structure using the internal allocator.
+ template <typename Ty> Ty *allocate() {
+ return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
+ }
+
+ /// Get the reference to the global handler of this plugin.
+ GenericGlobalHandlerTy &getGlobalHandler() {
+ assert(GlobalHandler && "Global handler not initialized");
+ return *GlobalHandler;
+ }
+
+ /// Get the reference to the JIT used for all devices connected to this
+ /// plugin.
+ JITEngine &getJIT() { return JIT; }
+
+ /// Get a reference to the RPC server used to provide host services.
+ RPCServerTy &getRPCServer() {
+ assert(RPCServer && "RPC server not initialized");
+ return *RPCServer;
+ }
+
+ /// Get the OpenMP requires flags set for this plugin.
+ int64_t getRequiresFlags() const { return RequiresFlags; }
+
+ /// Set the OpenMP requires flags for this plugin.
+ void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; }
+
+ /// Initialize a device within the plugin.
+ Error initDevice(int32_t DeviceId);
+
+ /// Deinitialize a device within the plugin and release its resources.
+ Error deinitDevice(int32_t DeviceId);
+
+ /// Indicate whether data can be exchanged directly between two devices under
+ /// this same plugin. If this function returns true, it's safe to call the
+ /// GenericDeviceTy::exchangeData() function on the source device.
+ virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
+ return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
+ }
+
+ /// Top level interface to verify if a given ELF image can be executed on a
+ /// given target. Returns true if the \p Image is compatible with the plugin.
+ Expected<bool> checkELFImage(StringRef Image) const;
+
+ /// Indicate if an image is compatible with the plugin devices. Notice that
+ /// this function may be called before actually initializing the devices. So
+ /// we could not move this function into GenericDeviceTy.
+ virtual Expected<bool> isELFCompatible(StringRef Image) const = 0;
+
+protected:
+ /// Indicate whether a device id is valid.
+ bool isValidDeviceId(int32_t DeviceId) const {
+ return (DeviceId >= 0 && DeviceId < getNumDevices());
+ }
+
+public:
+ // TODO: This plugin interface needs to be cleaned up.
+
+ /// Returns non-zero if the provided \p Image can be executed by the runtime.
+ int32_t is_valid_binary(__tgt_device_image *Image);
+
+ /// Initialize the device inside of the plugin.
+ int32_t init_device(int32_t DeviceId);
+
+ /// Return the number of devices this plugin can support.
+ int32_t number_of_devices();
+
+ /// Initializes the OpenMP register requires information.
+ int64_t init_requires(int64_t RequiresFlags);
+
+ /// Returns non-zero if the data can be exchanged between the two devices.
+ int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
+
+ /// Initializes the record and replay mechanism inside the plugin.
+ int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
+ void *VAddr, bool isRecord, bool SaveOutput,
+ uint64_t &ReqPtrArgOffset);
+
+ /// Loads the associated binary into the plugin and returns a handle to it.
+ int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
+ __tgt_device_binary *Binary);
+
+ /// Allocates memory that is accessible to the given device.
+ void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
+
+ /// Deallocates memory on the given device.
+ int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
+
+ /// Locks / pins host memory using the plugin runtime.
+ int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
+ void **LockedPtr);
+
+ /// Unlocks / unpins host memory using the plugin runtime.
+ int32_t data_unlock(int32_t DeviceId, void *Ptr);
+
+ /// Notify the runtime about a new mapping that has been created outside.
+ int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
+
+ /// Notify the runtime about a mapping that has been deleted.
+ int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
+
+ /// Copy data to the given device.
+ int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
+ int64_t Size);
+
+ /// Copy data to the given device asynchronously.
+ int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
+ int64_t Size, __tgt_async_info *AsyncInfoPtr);
+
+ /// Copy data from the given device.
+ int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
+ int64_t Size);
+
+ /// Copy data from the given device asynchronously.
+ int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
+ int64_t Size, __tgt_async_info *AsyncInfoPtr);
+
+ /// Exchange memory addresses between two devices.
+ int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
+ void *DstPtr, int64_t Size);
+
+ /// Exchange memory addresses between two devices asynchronously.
+ int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
+ int DstDeviceId, void *DstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+
+ /// Begin executing a kernel on the given device.
+ int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
+ ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
+ __tgt_async_info *AsyncInfoPtr);
+
+ /// Synchronize an asynchronous queue with the plugin runtime.
+ int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+
+ /// Query the current state of an asynchronous queue.
+ int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+
+ /// Prints information about the given devices supported by the plugin.
+ void print_device_info(int32_t DeviceId);
+
+ /// Creates an event in the given plugin if supported.
+ int32_t create_event(int32_t DeviceId, void **EventPtr);
+
+ /// Records an event that has occurred.
+ int32_t record_event(int32_t DeviceId, void *EventPtr,
+ __tgt_async_info *AsyncInfoPtr);
+
+ /// Wait until an event has occurred.
+ int32_t wait_event(int32_t DeviceId, void *EventPtr,
+ __tgt_async_info *AsyncInfoPtr);
+
+ /// Synchronize execution until an event is done.
+ int32_t sync_event(int32_t DeviceId, void *EventPtr);
+
+ /// Remove the event from the plugin.
+ int32_t destroy_event(int32_t DeviceId, void *EventPtr);
+
+ /// Set the info flag to \p NewInfoLevel.
+ void set_info_flag(uint32_t NewInfoLevel);
+
+ /// Creates an asynchronous queue for the given plugin.
+ int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
+
+ /// Creates device information to be used for diagnostics.
+ int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
+ const char **ErrStr);
+
+ /// Sets the offset into the devices for use by OMPT.
+ int32_t set_device_offset(int32_t DeviceIdOffset);
+
+ /// Returns if the plugin can support automatic zero-copy.
+ int32_t use_auto_zero_copy(int32_t DeviceId);
+
+ /// Look up a global symbol in the given binary.
+ int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
+ const char *Name, void **DevicePtr);
+
+ /// Look up a kernel function in the given binary.
+ int32_t get_function(__tgt_device_binary Binary, const char *Name,
+ void **KernelPtr);
+
+private:
+ /// Number of devices available for the plugin.
+ int32_t NumDevices = 0;
+
+ /// Index offset, which when added to a DeviceId, will yield a unique
+ /// user-observable device identifier. This is especially important when
+ /// DeviceIds of multiple plugins / RTLs need to be distinguishable.
+ int32_t DeviceIdStartIndex = 0;
+
+ /// Array of pointers to the devices. Initially, they are all set to nullptr.
+ /// Once a device is initialized, the pointer is stored in the position given
+ /// by its device id. A position with nullptr means that the corresponding
+ /// device was not initialized yet.
+ llvm::SmallVector<GenericDeviceTy *> Devices;
+
+ /// OpenMP requires flags.
+ int64_t RequiresFlags;
+
+ /// Pointer to the global handler for this plugin.
+ GenericGlobalHandlerTy *GlobalHandler;
+
+ /// Internal allocator for different structures.
+ BumpPtrAllocator Allocator;
+
+ /// The JIT engine shared by all devices connected to this plugin.
+ JITEngine JIT;
+
+ /// The interface between the plugin and the GPU for host services.
+ RPCServerTy *RPCServer;
+};
+
+namespace Plugin {
+/// Create a success error. This is the same as calling Error::success(), but
+/// it is recommended to use this one for consistency with Plugin::error() and
+/// Plugin::check().
+static Error success() { return Error::success(); }
+
+/// Create a string error from the format string \p ErrFmt and \p Args.
+template <typename... ArgsTy>
+static Error error(const char *ErrFmt, ArgsTy... Args) {
+ return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
+}
+
+/// Check the plugin-specific error code and return an error or success
+/// accordingly. In case of an error, create a string error with the error
+/// description. The ErrFmt should follow the format:
+/// "Error in <function name>[<optional info>]: %s"
+/// The last format specifier "%s" is mandatory and will be used to place the
+/// error code's description. Note that this function should only be called
+/// from the plugin-specific code.
+/// TODO: Refactor this, must be defined individually by each plugin.
+template <typename... ArgsTy>
+static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
+} // namespace Plugin
+
+/// Class for simplifying the getter operation of the plugin. Anywhere on the
+/// code, the current plugin can be retrieved by PluginTy::get(). The class also
+/// declares functions to create plugin-specific object instances. The check(),
+/// createPlugin(), createDevice() and createGlobalHandler() functions should be
+/// defined by each plugin implementation.
+class PluginTy {
+ // Pointer to the plugin instance.
+ static GenericPluginTy *SpecificPlugin;
+
+ PluginTy() {
+ if (auto Err = init())
+ REPORT("Failed to initialize plugin: %s\n",
+ toString(std::move(Err)).data());
+ }
+
+ ~PluginTy() {
+ if (auto Err = deinit())
+ REPORT("Failed to deinitialize plugin: %s\n",
+ toString(std::move(Err)).data());
+ }
+
+ PluginTy(const PluginTy &) = delete;
+ void operator=(const PluginTy &) = delete;
+
+ /// Create and initialize the plugin instance.
+ static Error init() {
+ assert(!SpecificPlugin && "Plugin already created");
+
+ // Create the specific plugin.
+ SpecificPlugin = createPlugin();
+ assert(SpecificPlugin && "Plugin was not created");
+
+ // Initialize the plugin.
+ return SpecificPlugin->init();
+ }
+
+ // Deinitialize all devices and the plugin, then destroy the instance.
+ static Error deinit() {
+ assert(SpecificPlugin && "Plugin no longer valid");
+
+ for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices();
+ DevNo < NumDev; ++DevNo)
+ if (auto Err = SpecificPlugin->deinitDevice(DevNo))
+ return Err;
+
+ // Deinitialize the plugin.
+ if (auto Err = SpecificPlugin->deinit())
+ return Err;
+
+ // Delete the plugin instance.
+ delete SpecificPlugin;
+
+ // Invalidate the plugin pointer.
+ SpecificPlugin = nullptr;
+
+ return Plugin::success();
+ }
+
+public:
+ /// Initialize the plugin if needed. The plugin could have been initialized by
+ /// a previous call to PluginTy::get().
+ static Error initIfNeeded() {
+ // Trigger the initialization if needed.
+ get();
+
+ return Error::success();
+ }
+
+ /// Get a reference (or create if it was not created) to the plugin instance.
+ static GenericPluginTy &get() {
+ // This static variable will initialize the underlying plugin instance in
+ // case there was no previous explicit initialization. The initialization is
+ // thread safe.
+ static PluginTy Plugin;
+
+ assert(SpecificPlugin && "Plugin is not active");
+ return *SpecificPlugin;
+ }
+
+ /// Get a reference to the plugin with a specific plugin-specific type.
+ template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
+
+ /// Indicate whether the plugin is active.
+ static bool isActive() { return SpecificPlugin != nullptr; }
+
+ /// Create a plugin instance.
+ static GenericPluginTy *createPlugin();
+};
+
+/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
+/// acts as a reference to a device resource, such as a stream, and requires
+/// some basic functions to be implemented. The derived class should define an
+/// empty constructor that creates an empty and invalid resource reference. Do
+/// not create a new resource on the ctor, but on the create() function instead.
+///
+/// The derived class should also define the type HandleTy as the underlying
+/// resource handle type. For instance, in a CUDA stream it would be:
+/// using HandleTy = CUstream;
+struct GenericDeviceResourceRef {
+ /// Create a new resource and store a reference to it.
+ virtual Error create(GenericDeviceTy &Device) = 0;
+
+ /// Destroy and release the resources pointed to by the reference.
+ virtual Error destroy(GenericDeviceTy &Device) = 0;
+
+protected:
+ ~GenericDeviceResourceRef() = default;
+};
+
+/// Class that implements a resource pool belonging to a device. This class
+/// operates with references to the actual resources. These references must
+/// derive from the GenericDeviceResourceRef class and implement the create
+/// and destroy virtual functions.
+template <typename ResourceRef> class GenericDeviceResourceManagerTy {
+ using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
+ using ResourceHandleTy = typename ResourceRef::HandleTy;
+
+public:
+ /// Create an empty resource pool for a specific device.
+ GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
+ : Device(Device), NextAvailable(0) {}
+
+ /// Destroy the resource pool. At this point, the deinit() function should
+ /// already have been executed so the resource pool should be empty.
+ virtual ~GenericDeviceResourceManagerTy() {
+ assert(ResourcePool.empty() && "Resource pool not empty");
+ }
+
+ /// Initialize the resource pool with \p InitialSize resources.
+ Error init(uint32_t InitialSize) {
+ assert(ResourcePool.empty() && "Resource pool already initialized");
+ return ResourcePoolTy::resizeResourcePool(InitialSize);
+ }
+
+ /// Deinitialize the resource pool and delete all resources. This function
+ /// must be called before the destructor.
+ virtual Error deinit() {
+ if (NextAvailable)
+ DP("Missing %u resources to be returned\n", NextAvailable);
+
+ // TODO: This prevents a bug in libomptarget from making the plugins fail.
+ // There may be some resources not returned. Do not destroy these ones.
+ if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
+ return Err;
+
+ ResourcePool.clear();
+
+ return Plugin::success();
+ }
+
+ /// Get a resource from the pool or create new ones. If the function
+ /// succeeds, the handle to the resource is saved in \p Handle.
+ virtual Error getResource(ResourceHandleTy &Handle) {
+ // Get a resource with an empty resource processor.
+ return getResourcesImpl(1, &Handle,
+ [](ResourceHandleTy) { return Plugin::success(); });
+ }
+
+ /// Get multiple resources from the pool or create new ones. If the function
+ /// succeeds, the handles to the resources are saved in \p Handles.
+ virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
+ // Get resources with an empty resource processor.
+ return getResourcesImpl(Num, Handles,
+ [](ResourceHandleTy) { return Plugin::success(); });
+ }
+
+ /// Return resource to the pool.
+ virtual Error returnResource(ResourceHandleTy Handle) {
+ // Return a resource with an empty resource processor.
+ return returnResourceImpl(
+ Handle, [](ResourceHandleTy) { return Plugin::success(); });
+ }
+
+protected:
+ /// Get multiple resources from the pool or create new ones. If the function
+ /// succeeds, the handles to the resources are saved in \p Handles. Also
+ /// process each of the obtained resources with \p Processor.
+ template <typename FuncTy>
+ Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
+ FuncTy Processor) {
+ const std::lock_guard<std::mutex> Lock(Mutex);
+
+ assert(NextAvailable <= ResourcePool.size() &&
+ "Resource pool is corrupted");
+
+ if (NextAvailable + Num > ResourcePool.size())
+ // Double the resource pool or resize it to provide the requested ones.
+ if (auto Err = ResourcePoolTy::resizeResourcePool(
+ std::max(NextAvailable * 2, NextAvailable + Num)))
+ return Err;
+
+ // Save the handles in the output array parameter.
+ for (uint32_t r = 0; r < Num; ++r)
+ Handles[r] = ResourcePool[NextAvailable + r];
+
+ // Process all obtained resources.
+ for (uint32_t r = 0; r < Num; ++r)
+ if (auto Err = Processor(Handles[r]))
+ return Err;
+
+ NextAvailable += Num;
+
+ return Plugin::success();
+ }
+
+ /// Return resource to the pool and process the resource with \p Processor.
+ template <typename FuncTy>
+ Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
+ const std::lock_guard<std::mutex> Lock(Mutex);
+
+ // Process the returned resource.
+ if (auto Err = Processor(Handle))
+ return Err;
+
+ assert(NextAvailable > 0 && "Resource pool is corrupted");
+ ResourcePool[--NextAvailable] = Handle;
+
+ return Plugin::success();
+ }
+
+protected:
+ /// The resources between \p OldSize and \p NewSize need to be created or
+ /// destroyed. The mutex is locked when this function is called.
+ Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
+ assert(OldSize != NewSize && "Resizing to the same size");
+
+ if (auto Err = Device.setContext())
+ return Err;
+
+ if (OldSize < NewSize) {
+ // Create new resources.
+ for (uint32_t I = OldSize; I < NewSize; ++I) {
+ if (auto Err = ResourcePool[I].create(Device))
+ return Err;
+ }
+ } else {
+ // Destroy the obsolete resources.
+ for (uint32_t I = NewSize; I < OldSize; ++I) {
+ if (auto Err = ResourcePool[I].destroy(Device))
+ return Err;
+ }
+ }
+ return Plugin::success();
+ }
+
+ /// Increase or decrease the number of resources. This function should
+ /// be called with the mutex acquired.
+ Error resizeResourcePool(uint32_t NewSize) {
+ uint32_t OldSize = ResourcePool.size();
+
+ // Nothing to do.
+ if (OldSize == NewSize)
+ return Plugin::success();
+
+ if (OldSize < NewSize) {
+ // Grow the container first so the new slots exist before creation.
+ ResourcePool.resize(NewSize);
+ return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
+ }
+
+ // Otherwise destroy the trailing resources, then shrink the container.
+ auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
+ ResourcePool.resize(NewSize);
+
+ return Err;
+ }
+
+ /// The device to which the resources belong
+ GenericDeviceTy &Device;
+
+ /// Mutex for the resource pool.
+ std::mutex Mutex;
+
+ /// The next available resource in the pool.
+ uint32_t NextAvailable;
+
+ /// The actual resource pool.
+ std::deque<ResourceRef> ResourcePool;
+};
+
+/// A static check of whether or not RPC is supported in libomptarget.
+bool libomptargetSupportsRPC();
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
new file mode 100644
index 000000000000..01bf539bcb3f
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -0,0 +1,69 @@
+//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to support remote procedure calls (RPC) from
+// the GPU. This is required to implement host services like printf or malloc.
+// The interface to the RPC server is provided by the 'libc' project in LLVM.
+// For more information visit https://libc.llvm.org/gpu/.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Error.h"
+
+#include <cstdint>
+
+namespace llvm::omp::target {
+namespace plugin {
+struct GenericPluginTy;
+struct GenericDeviceTy;
+class GenericGlobalHandlerTy;
+class DeviceImageTy;
+} // namespace plugin
+
+/// A generic class implementing the interface between the RPC server provided
+/// by the 'libc' project and 'libomptarget'. If the RPC server is not available
+/// these routines will perform no action.
+struct RPCServerTy {
+public:
+ /// Initializes the handles to the number of devices we may need to service.
+ RPCServerTy(plugin::GenericPluginTy &Plugin);
+
+ /// Check if this device image is using an RPC server. This checks for the
+ /// presence of an externally visible symbol in the device image that will
+ /// be present whenever RPC code is called.
+ llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
+ plugin::GenericGlobalHandlerTy &Handler,
+ plugin::DeviceImageTy &Image);
+
+ /// Initialize the RPC server for the given device. This will allocate host
+ /// memory for the internal server and copy the data to the client on the
+ /// device. The device must be loaded before this is valid.
+ llvm::Error initDevice(plugin::GenericDeviceTy &Device,
+ plugin::GenericGlobalHandlerTy &Handler,
+ plugin::DeviceImageTy &Image);
+
+ /// Runs the RPC server associated with the \p Device until the pending work
+ /// is cleared.
+ llvm::Error runServer(plugin::GenericDeviceTy &Device);
+
+ /// Deinitialize the RPC server for the given device. This will free the
+ /// memory associated with it (presumably what initDevice() allocated).
+ llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
+
+private:
+ /// Array from this device's identifier to its attached devices.
+ llvm::SmallVector<uintptr_t> Handles;
+};
+
+} // namespace llvm::omp::target
+
+#endif
diff --git a/offload/plugins-nextgen/common/include/Utils/ELF.h b/offload/plugins-nextgen/common/include/Utils/ELF.h
new file mode 100644
index 000000000000..88c83d39b68c
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/Utils/ELF.h
@@ -0,0 +1,44 @@
+//===-- Utils/ELF.h - Common ELF functionality ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Common ELF functionality for target plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
+
+#include "Shared/PluginAPI.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+namespace utils {
+namespace elf {
+
+/// Returns true if the \p Buffer is an ELF file, false otherwise.
+bool isELF(llvm::StringRef Buffer);
+
+/// Checks if the given \p Object is a valid ELF matching the e_machine value.
+llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine);
+
+/// Returns a pointer to the given \p Symbol inside of an ELF object.
+llvm::Expected<const void *>
+getSymbolAddress(const llvm::object::ELFSymbolRef &Symbol);
+
+/// Returns the symbol associated with the \p Name in the \p ELFObj. It will
+/// first search for the hash sections to identify symbols from the hash table.
+/// If that fails, it will fall back to a linear search in the case of an
+/// executable file without a hash table.
+llvm::Expected<std::optional<llvm::object::ELFSymbolRef>>
+getSymbol(const llvm::object::ObjectFile &ELFObj, llvm::StringRef Name);
+
+} // namespace elf
+} // namespace utils
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H