diff options
Diffstat (limited to 'offload/plugins-nextgen/common/include/PluginInterface.h')
| -rw-r--r-- | offload/plugins-nextgen/common/include/PluginInterface.h | 1537 |
1 files changed, 1537 insertions, 0 deletions
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h new file mode 100644 index 000000000000..79e8464bfda5 --- /dev/null +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -0,0 +1,1537 @@ +//===- PluginInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H + +#include <cstddef> +#include <cstdint> +#include <deque> +#include <list> +#include <map> +#include <shared_mutex> +#include <vector> + +#include "Shared/Debug.h" +#include "Shared/Environment.h" +#include "Shared/EnvironmentVar.h" +#include "Shared/Requirements.h" +#include "Shared/Utils.h" + +#include "GlobalHandler.h" +#include "JIT.h" +#include "MemoryManager.h" +#include "RPC.h" +#include "omptarget.h" + +#ifdef OMPT_SUPPORT +#include "omp-tools.h" +#endif + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" + +namespace llvm { +namespace omp { +namespace target { + +namespace plugin { + +struct GenericPluginTy; +struct GenericKernelTy; +struct GenericDeviceTy; + +/// Class that wraps the __tgt_async_info to simplify its usage. 
In case the +/// object is constructed without a valid __tgt_async_info, the object will use +/// an internal one and will synchronize the current thread with the pending +/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function +/// must be called before destroying the wrapper object. +struct AsyncInfoWrapperTy { + AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr); + + ~AsyncInfoWrapperTy() { + assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized"); + } + + /// Get the raw __tgt_async_info pointer. + operator __tgt_async_info *() const { return AsyncInfoPtr; } + + /// Indicate whether there is a queue. + bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); } + + /// Get the queue. + template <typename Ty> Ty getQueueAs() { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type"); + return static_cast<Ty>(AsyncInfoPtr->Queue); + } + + /// Set the queue. + template <typename Ty> void setQueueAs(Ty Queue) { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type"); + assert(!AsyncInfoPtr->Queue && "Overwriting queue"); + AsyncInfoPtr->Queue = Queue; + } + + /// Synchronize with the __tgt_async_info's pending operations if it's the + /// internal async info. The error associated with the asynchronous operations + /// issued in this queue must be provided in \p Err. This function will update + /// the error parameter with the result of the synchronization if it was + /// actually executed. This function must be called before destroying the + /// object and only once. + void finalize(Error &Err); + + /// Register \p Ptr as an associated allocation that is freed after + /// finalization. 
+ void freeAllocationAfterSynchronization(void *Ptr) { + AsyncInfoPtr->AssociatedAllocations.push_back(Ptr); + } + +private: + GenericDeviceTy &Device; + __tgt_async_info LocalAsyncInfo; + __tgt_async_info *AsyncInfoPtr; +}; + +/// The information level represents the level of a key-value property in the +/// info tree print (i.e. indentation). The first level should be the default. +enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 }; + +/// Class for storing device information to be printed later. An object of this +/// type acts as a queue of key-value properties. Each property has a key, +/// a value, and an optional unit for the value. For printing purposes, the +/// information can be classified into several levels. These levels are useful +/// for defining sections and subsections. Thus, each key-value property also +/// has an additional field indicating which level it belongs to. Notice that +/// we use the level to determine the indentation of the key-value property at +/// printing time. See the enum InfoLevelKind for the list of accepted levels. +class InfoQueueTy { + struct InfoQueueEntryTy { + std::string Key; + std::string Value; + std::string Units; + uint64_t Level; + }; + + std::deque<InfoQueueEntryTy> Queue; + +public: + /// Add a new info entry to the queue. The entry requires at least a key + /// string in \p Key. The value in \p Value is optional and can be any type + /// that is representable as a string. The units in \p Units is optional and + /// must be a string. The info level is a template parameter that defaults to + /// the first level (top level). + template <InfoLevelKind L = InfoLevel1, typename T = std::string> + void add(const std::string &Key, T Value = T(), + const std::string &Units = std::string()) { + assert(!Key.empty() && "Invalid info key"); + + // Convert the value to a string depending on its type. + if constexpr (std::is_same_v<T, bool>) + Queue.push_back({Key, Value ? 
"Yes" : "No", Units, L}); + else if constexpr (std::is_arithmetic_v<T>) + Queue.push_back({Key, std::to_string(Value), Units, L}); + else + Queue.push_back({Key, Value, Units, L}); + } + + /// Print all info entries added to the queue. + void print() const { + // We print four spaces for each level. + constexpr uint64_t IndentSize = 4; + + // Find the maximum key length (level + key) to compute the individual + // indentation of each entry. + uint64_t MaxKeySize = 0; + for (const auto &Entry : Queue) { + uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize; + if (KeySize > MaxKeySize) + MaxKeySize = KeySize; + } + + // Print all info entries. + for (const auto &Entry : Queue) { + // Compute the indentations for the current entry. + uint64_t KeyIndentSize = Entry.Level * IndentSize; + uint64_t ValIndentSize = + MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize; + + llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key + << std::string(ValIndentSize, ' ') << Entry.Value + << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n"; + } + } +}; + +/// Class wrapping a __tgt_device_image and its offload entry table on a +/// specific device. This class is responsible for storing and managing +/// the offload entries for an image on a device. +class DeviceImageTy { + /// Image identifier within the corresponding device. Notice that this id is + /// not unique between different devices; they may overlap. + int32_t ImageId; + + /// The pointer to the raw __tgt_device_image. + const __tgt_device_image *TgtImage; + const __tgt_device_image *TgtImageBitcode; + + /// Reference to the device this image is loaded on. + GenericDeviceTy &Device; + + /// If this image has any global destructors that must be called. + /// FIXME: This is only required because we currently have no invariants + /// towards the lifetime of the underlying image. We should either copy + /// the image into memory locally or erase the pointers after init. 
+ bool PendingGlobalDtors; + +public: + DeviceImageTy(int32_t Id, GenericDeviceTy &Device, + const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device), + PendingGlobalDtors(false) { + assert(TgtImage && "Invalid target image"); + } + + /// Get the image identifier within the device. + int32_t getId() const { return ImageId; } + + /// Get the device that this image is loaded onto. + GenericDeviceTy &getDevice() const { return Device; } + + /// Get the pointer to the raw __tgt_device_image. + const __tgt_device_image *getTgtImage() const { return TgtImage; } + + void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) { + this->TgtImageBitcode = TgtImageBitcode; + } + + const __tgt_device_image *getTgtImageBitcode() const { + return TgtImageBitcode; + } + + /// Get the image starting address. + void *getStart() const { return TgtImage->ImageStart; } + + /// Get the image size. + size_t getSize() const { + return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart); + } + + /// Get a memory buffer reference to the whole image. + MemoryBufferRef getMemoryBuffer() const { + return MemoryBufferRef(StringRef((const char *)getStart(), getSize()), + "Image"); + } + /// Accessors to the pending global destructors flag. + bool setPendingGlobalDtors() { return PendingGlobalDtors = true; } + bool hasPendingGlobalDtors() const { return PendingGlobalDtors; } +}; + +/// Class implementing common functionalities of offload kernels. Each plugin +/// should define the specific kernel class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericKernelTy { + /// Construct a kernel with a name. + GenericKernelTy(const char *Name) + : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {} + + virtual ~GenericKernelTy() {} + + /// Initialize the kernel object from a specific device. 
+ Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image); + virtual Error initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) = 0; + + /// Launch the kernel on the specific device. The device must be the same + /// one used to initialize the kernel. + Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + void *Args, + AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + + /// Get the kernel name. + const char *getName() const { return Name; } + + /// Return true if this kernel is a constructor or destructor. + bool isCtorOrDtor() const { + // TODO: This is not a great solution and should be revisited. + return StringRef(Name).ends_with("tor"); + } + + /// Get the kernel image. + DeviceImageTy &getImage() const { + assert(ImagePtr && "Kernel is not initialized!"); + return *ImagePtr; + } + + /// Return the kernel environment object for kernel \p Name. + const KernelEnvironmentTy &getKernelEnvironmentForKernel() { + return KernelEnvironment; + } + + /// Return a device pointer to a new kernel launch environment. + Expected<KernelLaunchEnvironmentTy *> + getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, + AsyncInfoWrapperTy &AsyncInfo) const; + + /// Indicate whether an execution mode is valid. + static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + case OMP_TGT_EXEC_MODE_GENERIC: + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return true; + } + return false; + } + +protected: + /// Get the execution mode name of the kernel. 
+ const char *getExecutionModeName() const { + switch (KernelEnvironment.Configuration.ExecMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// Prints generic kernel launch information. + Error printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const; + + /// Prints plugin-specific kernel launch information after generic kernel + /// launch information + virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const; + +private: + /// Prepare the arguments before launching the kernel. + void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, uint32_t &NumArgs, + llvm::SmallVectorImpl<void *> &Args, + llvm::SmallVectorImpl<void *> &Ptrs, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; + + /// Get the number of threads and blocks for the kernel based on the + /// user-defined threads and block clauses. + uint32_t getNumThreads(GenericDeviceTy &GenericDevice, + uint32_t ThreadLimitClause[3]) const; + + /// The number of threads \p NumThreads can be adjusted by this method. + /// \p IsNumThreadsFromUser is true if \p NumThreads is defined by user via + /// thread_limit clause. + uint64_t getNumBlocks(GenericDeviceTy &GenericDevice, + uint32_t BlockLimitClause[3], uint64_t LoopTripCount, + uint32_t &NumThreads, bool IsNumThreadsFromUser) const; + + /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. 
+ bool isGenericSPMDMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + bool isGenericMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_GENERIC; + } + bool isSPMDMode() const { + return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD; + } + + /// The kernel name. + const char *Name; + + /// The image that contains this kernel. + DeviceImageTy *ImagePtr = nullptr; + +protected: + /// The preferred number of threads to run the kernel. + uint32_t PreferredNumThreads; + + /// The maximum number of threads which the kernel could leverage. + uint32_t MaxNumThreads; + + /// The kernel environment, including execution flags. + KernelEnvironmentTy KernelEnvironment; + + /// The prototype kernel launch environment. + KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// If the kernel is a bare kernel. + bool IsBareKernel = false; +}; + +/// Class representing a map of host pinned allocations. We track these pinned +/// allocations, so memory transfers involving these buffers can be optimized. +class PinnedAllocationMapTy { + + /// Struct representing a map entry. + struct EntryTy { + /// The host pointer of the pinned allocation. + void *HstPtr; + + /// The pointer that devices' driver should use to transfer data from/to the + /// pinned allocation. In most plugins, this pointer will be the same as the + /// host pointer above. + void *DevAccessiblePtr; + + /// The size of the pinned allocation. + size_t Size; + + /// Indicate whether the allocation was locked from outside the plugin, for + /// instance, from the application. The externally locked allocations are + /// not unlocked by the plugin when unregistering the last user. + bool ExternallyLocked; + + /// The number of references to the pinned allocation. The allocation should + /// remain pinned and registered to the map until the number of references + /// becomes zero. 
+ mutable size_t References; + + /// Create an entry with the host and device accessible pointers, the buffer + /// size, and a boolean indicating whether the buffer was locked externally. + EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size, + bool ExternallyLocked) + : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size), + ExternallyLocked(ExternallyLocked), References(1) {} + + /// Utility constructor used for std::set searches. + EntryTy(void *HstPtr) + : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0), + ExternallyLocked(false), References(0) {} + }; + + /// Comparator of map entries. Use the host pointer to enforce an order + /// between entries. + struct EntryCmpTy { + bool operator()(const EntryTy &Left, const EntryTy &Right) const { + return Left.HstPtr < Right.HstPtr; + } + }; + + typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy; + + /// The map of host pinned allocations. + PinnedAllocSetTy Allocs; + + /// The mutex to protect accesses to the map. + mutable std::shared_mutex Mutex; + + /// Reference to the corresponding device. + GenericDeviceTy &Device; + + /// Indicate whether mapped host buffers should be locked automatically. + bool LockMappedBuffers = false; + + /// Indicate whether failures when locking mapped buffers should be ignored. + /// Defaults to true so the flag is never left indeterminate when the + /// constructor rejects the environment variable value. + bool IgnoreLockMappedFailures = true; + + /// Find an allocation that intersects with \p HstPtr pointer. Assume the + /// map's mutex is acquired. + const EntryTy *findIntersecting(const void *HstPtr) const { + if (Allocs.empty()) + return nullptr; + + // Search the first allocation with starting address that is not less than + // the buffer address. + auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)}); + + // Direct match of starting addresses. + if (It != Allocs.end() && It->HstPtr == HstPtr) + return &(*It); + + // Not direct match but may be a previous pinned allocation in the map which + // contains the buffer. Return null if there is no such previous + // allocation. 
+ if (It == Allocs.begin()) + return nullptr; + + // Move to the previous pinned allocation. + --It; + + // The previous allocation ends beyond the buffer address, so it contains it. + if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr) + return &(*It); + + // None found. + return nullptr; + } + + /// Insert an entry to the map representing a locked buffer. The number of + /// references is set to one. + Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size, + bool ExternallyLocked = false); + + /// Erase an existing entry from the map. + Error eraseEntry(const EntryTy &Entry); + + /// Register a new user into an entry that represents a locked buffer. Check + /// also that the registered buffer with \p HstPtr address and \p Size is + /// actually contained in the entry. + Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size); + + /// Unregister a user from the entry and return whether it is the last user. + /// If it is the last user, the entry will have to be removed from the map + /// and unlock the entry's host buffer (if necessary). + Expected<bool> unregisterEntryUse(const EntryTy &Entry); + + /// Indicate whether the first range A fully contains the second range B. + static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { + void *EndA = advanceVoidPtr(PtrA, SizeA); + void *EndB = advanceVoidPtr(PtrB, SizeB); + return (PtrB >= PtrA && EndB <= EndA); + } + + /// Indicate whether the first range A intersects with the second range B. + static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) { + void *EndA = advanceVoidPtr(PtrA, SizeA); + void *EndB = advanceVoidPtr(PtrB, SizeB); + return (PtrA < EndB && PtrB < EndA); + } + +public: + /// Create the map of pinned allocations corresponding to a specific device. + PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) { + + // Envar that indicates whether mapped host buffers should be locked + // automatically. 
The possible values are boolean (on/off) and a special: + // off: Mapped host buffers are not locked. + // on: Mapped host buffers are locked in a best-effort approach. + // Failure to lock the buffers are silent. + // mandatory: Mapped host buffers are always locked and failures to lock + // a buffer results in a fatal error. + StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS", + "off"); + + bool Enabled; + if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) { + // Parsed as a boolean value. Enable the feature if necessary. + LockMappedBuffers = Enabled; + IgnoreLockMappedFailures = true; + } else if (OMPX_LockMappedBuffers.get() == "mandatory") { + // Enable the feature and failures are fatal. + LockMappedBuffers = true; + IgnoreLockMappedFailures = false; + } else { + // Disable by default. + DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n", + OMPX_LockMappedBuffers.get().data()); + LockMappedBuffers = false; + } + } + + /// Register a buffer that was recently allocated as a locked host buffer. + /// None of the already registered pinned allocations should intersect with + /// this new one. The registration requires the host pointer in \p HstPtr, + /// the device accessible pointer in \p DevAccessiblePtr, and the size of the + /// allocation in \p Size. The allocation must be unregistered using the + /// unregisterHostBuffer function. + Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size); + + /// Unregister a host pinned allocation passing the host pointer which was + /// previously registered using the registerHostBuffer function. When calling + /// this function, the pinned allocation cannot have any other user and will + /// not be unlocked by this function. + Error unregisterHostBuffer(void *HstPtr); + + /// Lock the host buffer at \p HstPtr or register a new user if it intersects + /// with an already existing one. A partial overlapping with extension is not + /// allowed. 
The function returns the device accessible pointer of the pinned + /// buffer. The buffer must be unlocked using the unlockHostBuffer function. + Expected<void *> lockHostBuffer(void *HstPtr, size_t Size); + + /// Unlock the host buffer at \p HstPtr or unregister a user if other users + /// are still using the pinned allocation. If this was the last user, the + /// pinned allocation is removed from the map and the memory is unlocked. + Error unlockHostBuffer(void *HstPtr); + + /// Lock or register a host buffer that was recently mapped by libomptarget. + /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is + /// enabled. Even if not enabled, externally locked buffers are registered + /// in order to optimize their transfers. + Error lockMappedHostBuffer(void *HstPtr, size_t Size); + + /// Unlock or unregister a host buffer that was unmapped by libomptarget. + Error unlockUnmappedHostBuffer(void *HstPtr); + + /// Return the device accessible pointer associated to the host pinned + /// allocation which the \p HstPtr belongs, if any. Return null in case the + /// \p HstPtr does not belong to any host pinned allocation. The device + /// accessible pointer is the one that devices should use for data transfers + /// that involve a host pinned buffer. + void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const { + std::shared_lock<std::shared_mutex> Lock(Mutex); + + // Find the intersecting allocation if any. + const EntryTy *Entry = findIntersecting(HstPtr); + if (!Entry) + return nullptr; + + return advanceVoidPtr(Entry->DevAccessiblePtr, + getPtrDiff(HstPtr, Entry->HstPtr)); + } + + /// Check whether a buffer belongs to a registered host pinned allocation. + bool isHostPinnedBuffer(const void *HstPtr) const { + std::shared_lock<std::shared_mutex> Lock(Mutex); + + // Return whether there is an intersecting allocation. 
+ return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr); + } +}; + +/// Class implementing common functionalities of offload devices. Each plugin +/// should define the specific device class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericDeviceTy : public DeviceAllocatorTy { + /// Construct a device with its device id within the plugin, the number of + /// devices in the plugin and the grid values for that kind of device. + GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &GridValues); + + /// Get the device identifier within the corresponding plugin. Notice that + /// this id is not unique between different plugins; they may overlap. + int32_t getDeviceId() const { return DeviceId; } + + /// Set the context of the device if needed, before calling device-specific + /// functions. Plugins may implement this function as a no-op if not needed. + virtual Error setContext() = 0; + + /// Initialize the device. After this call, the device should be already + /// working and ready to accept queries or modifications. + Error init(GenericPluginTy &Plugin); + virtual Error initImpl(GenericPluginTy &Plugin) = 0; + + /// Deinitialize the device and free all its resources. After this call, the + /// device is no longer considered ready, so no queries or modifications are + /// allowed. + Error deinit(GenericPluginTy &Plugin); + virtual Error deinitImpl() = 0; + + /// Load the binary image into the device and return the target table. + Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); + virtual Expected<DeviceImageTy *> + loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; + + /// Setup the device environment if needed. Notice this setup may not be run + /// on some plugins. 
By default, it will be executed, but plugins can change + /// this behavior by overriding the shouldSetupDeviceEnvironment function. + Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + + /// Setup the global device memory pool, if the plugin requires one. + Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, + uint64_t PoolSize); + + // Setup the RPC server for this device if needed. This may not run on some + // plugins like the CPU targets. By default, it will not be executed so it is + // up to the target to override this using the shouldSetupRPCServer function. + Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); + + /// Synchronize the current thread with the pending operations on the + /// __tgt_async_info structure. + Error synchronize(__tgt_async_info *AsyncInfo); + virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Invokes any global constructors on the device if present and is required + /// by the target. + virtual Error callGlobalConstructors(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + return Error::success(); + } + + /// Invokes any global destructors on the device if present and is required + /// by the target. + virtual Error callGlobalDestructors(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + return Error::success(); + } + + /// Query for the completion of the pending operations on the __tgt_async_info + /// structure in a non-blocking manner. + Error queryAsync(__tgt_async_info *AsyncInfo); + virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Check whether the architecture supports VA management + virtual bool supportVAManagement() const { return false; } + + /// Get the total device memory size + virtual Error getDeviceMemorySize(uint64_t &DSize); + + /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to + /// map it to \p VAddr. The obtained address is stored in \p Addr. 
At return + /// \p RSize contains the actual size which can be equal or larger than the + /// requested size. + virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize); + + /// De-allocates device memory and unmaps the virtual address \p VAddr + virtual Error memoryVAUnMap(void *VAddr, size_t Size); + + /// Allocate data on the device or involving the device. + Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); + + /// Deallocate data from the device or involving the device. + Error dataDelete(void *TgtPtr, TargetAllocTy Kind); + + /// Pin host memory to optimize transfers and return the device accessible + /// pointer that devices should use for memory transfers involving the host + /// pinned allocation. + Expected<void *> dataLock(void *HstPtr, int64_t Size) { + return PinnedAllocs.lockHostBuffer(HstPtr, Size); + } + + /// Unpin a host memory buffer that was previously pinned. + Error dataUnlock(void *HstPtr) { + return PinnedAllocs.unlockHostBuffer(HstPtr); + } + + /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific + /// API and return the device accessible pointer. + virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0; + + /// Unlock a previously locked host buffer starting at \p HstPtr. + virtual Error dataUnlockImpl(void *HstPtr) = 0; + + /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped + /// buffer. This means that libomptarget created a new mapping of that host + /// buffer (e.g., because a user OpenMP target map) and the buffer may be used + /// as source/destination of memory transfers. We can use this information to + /// lock the host buffer and optimize its memory transfers. + Error notifyDataMapped(void *HstPtr, int64_t Size) { + return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size); + } + + /// Mark the host buffer with address \p HstPtr as unmapped. This means that + /// libomptarget removed an existing mapping. 
If the plugin locked the buffer + /// in notifyDataMapped, this function should unlock it. + Error notifyDataUnmapped(void *HstPtr) { + return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr); + } + + /// Check whether the host buffer with address \p HstPtr is pinned by the + /// underlying vendor-specific runtime (if any). Retrieve the host pointer, + /// the device accessible pointer and the size of the original pinned buffer. + virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, + void *&BaseDevAccessiblePtr, + size_t &BaseSize) const = 0; + + /// Submit data to the device (host to device transfer). + Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Exchange data between devices (device to device transfer). Calling this + /// function is only valid if GenericPlugin::isDataExchangable() passing the + /// two devices returns true. + Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Run the kernel associated with \p EntryPtr + Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, + KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); + + /// Initialize a __tgt_async_info structure. Related to interop features. 
+ Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr); + virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Initialize a __tgt_device_info structure. Related to interop features. + Error initDeviceInfo(__tgt_device_info *DeviceInfo); + virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + + /// Create an event. + Error createEvent(void **EventPtrStorage); + virtual Error createEventImpl(void **EventPtrStorage) = 0; + + /// Destroy an event. + Error destroyEvent(void *Event); + virtual Error destroyEventImpl(void *EventPtr) = 0; + + /// Start the recording of the event. + Error recordEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Wait for an event to finish. Notice this wait is asynchronous if the + /// __tgt_async_info is not nullptr. + Error waitEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Synchronize the current thread with the event. + Error syncEvent(void *EventPtr); + virtual Error syncEventImpl(void *EventPtr) = 0; + + /// Print information about the device. + Error printInfo(); + virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0; + + /// Getters of the grid values. + uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; } + uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; } + uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; } + uint32_t getDefaultNumThreads() const { + return GridValues.GV_Default_WG_Size; + } + uint32_t getDefaultNumBlocks() const { + return GridValues.GV_Default_Num_Teams; + } + uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } + + /// Get target compute unit kind (e.g., sm_80, or gfx908). 
+ virtual std::string getComputeUnitKind() const { return "unknown"; } + + /// Post processing after JIT backend. The ownership of \p MB will be taken. + virtual Expected<std::unique_ptr<MemoryBuffer>> + doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const { + return std::move(MB); + } + + /// The minimum number of threads we use for a low-trip count combined loop. + /// Instead of using more threads we increase the outer (block/team) + /// parallelism. + /// @see OMPX_MinThreadsForLowTripCount + virtual uint32_t getMinThreadsForLowTripCountLoop() { + return OMPX_MinThreadsForLowTripCount; + } + + /// Get the total amount of hardware parallelism supported by the target + /// device. This is the total amount of warps or wavefronts that can be + /// resident on the device simultaneously. + virtual uint64_t getHardwareParallelism() const { return 0; } + + /// Get the RPC server running on this device. + RPCServerTy *getRPCServer() const { return RPCServer; } + + /// The number of parallel RPC ports to use on the device. In general, this + /// should be roughly equivalent to the amount of hardware parallelism the + /// device can support. This is because GPUs in general do not have forward + /// progress guarantees, so we minimize thread level dependencies by + /// allocating enough space such that each device thread can have a port. This + /// is likely overly pessimistic in the average case, but guarantees no + /// deadlocks at the cost of memory. This must be overridden by targets + /// expecting to use the RPC server. + virtual uint64_t requestedRPCPortCount() const { + assert(!shouldSetupRPCServer() && "Default implementation cannot be used"); + return 0; + } + + /// Get the stack size of the device and store it in \p V. + virtual Error getDeviceStackSize(uint64_t &V) = 0; + + /// Returns true if current plugin architecture is an APU + /// and unified_shared_memory was not requested by the program. 
+ bool useAutoZeroCopy(); + virtual bool useAutoZeroCopyImpl() { return false; } + + /// Allocate and construct a kernel object. + virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0; + + /// Reference to the underlying plugin that created this device. + GenericPluginTy &Plugin; + +private: + /// Get and set the stack size and heap size for the device. If not used, the + /// plugin can implement the setters as no-ops and set the output + /// value to zero in the getters. + virtual Error setDeviceStackSize(uint64_t V) = 0; + virtual Error getDeviceHeapSize(uint64_t &V) = 0; + virtual Error setDeviceHeapSize(uint64_t V) = 0; + + /// Indicate whether the device should setup the device environment. Notice + /// that returning false in this function will change the behavior of the + /// setupDeviceEnvironment() function. + virtual bool shouldSetupDeviceEnvironment() const { return true; } + + /// Indicate whether the device should setup the global device memory pool. If + /// false is returned, the value on the device will be uninitialized. + virtual bool shouldSetupDeviceMemoryPool() const { return true; } + + /// Indicate whether or not the device should setup the RPC server. This is + /// only necessary for unhosted targets like the GPU. + virtual bool shouldSetupRPCServer() const { return false; } + + /// Pointer to the memory manager or nullptr if not available. + MemoryManagerTy *MemoryManager; + + /// Environment variables defined by the OpenMP standard. + Int32Envar OMP_TeamLimit; + Int32Envar OMP_NumTeams; + Int32Envar OMP_TeamsThreadLimit; + + /// Environment variables defined by the LLVM OpenMP implementation. + Int32Envar OMPX_DebugKind; + UInt32Envar OMPX_SharedMemorySize; + UInt64Envar OMPX_TargetStackSize; + UInt64Envar OMPX_TargetHeapSize; + + /// Environment flag to set the minimum number of threads we use for a + /// low-trip count combined loop. Instead of using more threads we increase + /// the outer (block/team) parallelism. 
+ UInt32Envar OMPX_MinThreadsForLowTripCount = + UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32); + +protected: + /// Environment variables defined by the LLVM OpenMP implementation + /// regarding the initial number of streams and events. + UInt32Envar OMPX_InitialNumStreams; + UInt32Envar OMPX_InitialNumEvents; + + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. + llvm::SmallVector<DeviceImageTy *> LoadedImages; + + /// The identifier of the device within the plugin. Notice this is not a + /// global device id and is not the device id visible to the OpenMP user. + const int32_t DeviceId; + + /// The default grid values used for this device. + llvm::omp::GV GridValues; + + /// Enumeration used for representing the current state of the peer access + /// between two devices (both under the same plugin). + /// The states can be a) PENDING when the state has not been queried and needs + /// to be queried, b) AVAILABLE when the peer access is available to be used, + /// and c) UNAVAILABLE if the system does not allow it. + enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; + + /// Array of peer access states with the rest of devices. This means that if + /// the device I has an array PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I can access device J's memory directly. However, notice this + /// does not mean that device J can access device I's memory directly. + llvm::SmallVector<PeerAccessState> PeerAccesses; + std::mutex PeerAccessesLock; + + /// Map of host pinned allocations used to optimize device transfers. + PinnedAllocationMapTy PinnedAllocs; + + /// A pointer to an RPC server instance attached to this device if present. + /// This is used to run the RPC server during task synchronization. 
+ RPCServerTy *RPCServer; + +#ifdef OMPT_SUPPORT + /// OMPT callback functions +#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr; + FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback) +#undef defineOmptCallback + + /// Internal representation for OMPT device (initialize & finalize) + std::atomic<bool> OmptInitialized; +#endif + +private: + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; + DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; +}; + +/// Class implementing common functionalities of offload plugins. Each plugin +/// should define the specific plugin class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericPluginTy { + + /// Construct a plugin instance; \p TA is passed to the JIT engine. + GenericPluginTy(Triple::ArchType TA) + : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), JIT(TA), + RPCServer(nullptr) {} + + virtual ~GenericPluginTy() {} + + /// Initialize the plugin. + Error init(); + + /// Initialize the plugin and return the number of available devices. + virtual Expected<int32_t> initImpl() = 0; + + /// Deinitialize the plugin and release the resources. + Error deinit(); + virtual Error deinitImpl() = 0; + + /// Create a new device for the underlying plugin. + virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin, + int32_t DeviceID, + int32_t NumDevices) = 0; + + /// Create a new global handler for the underlying plugin. + virtual GenericGlobalHandlerTy *createGlobalHandler() = 0; + + /// Get the reference to the device with a certain device id. The validity of + /// the id and the initialization of the device are checked by assertions + /// only. + GenericDeviceTy &getDevice(int32_t DeviceId) { + assert(isValidDeviceId(DeviceId) && "Invalid device id"); + assert(Devices[DeviceId] && "Device is unitialized"); + + return *Devices[DeviceId]; + } + + /// Get the number of active devices. + int32_t getNumDevices() const { return NumDevices; } + + /// Get the plugin-specific device identifier offset. 
+ int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; } + + /// Set the plugin-specific device identifier offset. + void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; } + + /// Get the ELF code to recognize the binary image of this plugin. + virtual uint16_t getMagicElfBits() const = 0; + + /// Get the target triple architecture of this plugin. + virtual Triple::ArchType getTripleArch() const = 0; + + /// Allocate a structure using the internal allocator. Only storage with the + /// size and alignment of \p Ty is obtained; no constructor is invoked here. + template <typename Ty> Ty *allocate() { + return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty))); + } + + /// Get the reference to the global handler of this plugin. + GenericGlobalHandlerTy &getGlobalHandler() { + assert(GlobalHandler && "Global handler not initialized"); + return *GlobalHandler; + } + + /// Get the reference to the JIT used for all devices connected to this + /// plugin. + JITEngine &getJIT() { return JIT; } + + /// Get a reference to the RPC server used to provide host services. + RPCServerTy &getRPCServer() { + assert(RPCServer && "RPC server not initialized"); + return *RPCServer; + } + + /// Get the OpenMP requires flags set for this plugin. + int64_t getRequiresFlags() const { return RequiresFlags; } + + /// Set the OpenMP requires flags for this plugin. + void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; } + + /// Initialize a device within the plugin. + Error initDevice(int32_t DeviceId); + + /// Deinitialize a device within the plugin and release its resources. + Error deinitDevice(int32_t DeviceId); + + /// Indicate whether data can be exchanged directly between two devices under + /// this same plugin. If this function returns true, it's safe to call the + /// GenericDeviceTy::dataExchange() function on the source device. 
+ virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); + } + + /// Top level interface to verify if a given ELF image can be executed on a + /// given target. Returns true if the \p Image is compatible with the plugin. + Expected<bool> checkELFImage(StringRef Image) const; + + /// Indicate if an image is compatible with the plugin devices. Notice that + /// this function may be called before actually initializing the devices. So + /// we could not move this function into GenericDeviceTy. + virtual Expected<bool> isELFCompatible(StringRef Image) const = 0; + +protected: + /// Indicate whether a device id is valid, i.e., it is in the range + /// [0, getNumDevices()). + bool isValidDeviceId(int32_t DeviceId) const { + return (DeviceId >= 0 && DeviceId < getNumDevices()); + } + +public: + // TODO: This plugin interface needs to be cleaned up. + + /// Returns non-zero if the provided \p Image can be executed by the runtime. + int32_t is_valid_binary(__tgt_device_image *Image); + + /// Initialize the device inside of the plugin. + int32_t init_device(int32_t DeviceId); + + /// Return the number of devices this plugin can support. + int32_t number_of_devices(); + + /// Initializes the OpenMP register requires information. + int64_t init_requires(int64_t RequiresFlags); + + /// Returns non-zero if the data can be exchanged between the two devices. + int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId); + + /// Initializes the record and replay mechanism inside the plugin. + int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize, + void *VAddr, bool isRecord, bool SaveOutput, + uint64_t &ReqPtrArgOffset); + + /// Loads the associated binary into the plugin and returns a handle to it. + int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage, + __tgt_device_binary *Binary); + + /// Allocates memory that is accessible to the given device. 
+ void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind); + + /// Deallocates memory on the given device. + int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind); + + /// Locks / pins host memory using the plugin runtime. + int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size, + void **LockedPtr); + + /// Unlocks / unpins host memory using the plugin runtime. + int32_t data_unlock(int32_t DeviceId, void *Ptr); + + /// Notify the runtime about a new mapping that has been created outside. + int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size); + + /// Notify the runtime about a mapping that has been deleted. + int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr); + + /// Copy data to the given device. + int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size); + + /// Copy data to the given device asynchronously. + int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); + + /// Copy data from the given device. + int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size); + + /// Copy data from the given device asynchronously. + int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr); + + /// Exchange memory addresses between two devices. + int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId, + void *DstPtr, int64_t Size); + + /// Exchange memory addresses between two devices asynchronously. + int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, + int DstDeviceId, void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + + /// Begin executing a kernel on the given device. 
+ int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, + __tgt_async_info *AsyncInfoPtr); + + /// Synchronize an asynchronous queue with the plugin runtime. + int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); + + /// Query the current state of an asynchronous queue. + int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); + + /// Prints information about the given device supported by the plugin. + void print_device_info(int32_t DeviceId); + + /// Creates an event in the given plugin if supported. + int32_t create_event(int32_t DeviceId, void **EventPtr); + + /// Records an event that has occurred. + int32_t record_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr); + + /// Wait until an event has occurred. + int32_t wait_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr); + + /// Synchronize execution until an event is done. + int32_t sync_event(int32_t DeviceId, void *EventPtr); + + /// Remove the event from the plugin. + int32_t destroy_event(int32_t DeviceId, void *EventPtr); + + /// Set the info flag of the plugin to \p NewInfoLevel. + void set_info_flag(uint32_t NewInfoLevel); + + /// Creates an asynchronous queue for the given plugin. + int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr); + + /// Creates device information to be used for diagnostics. + int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo, + const char **ErrStr); + + /// Sets the offset into the devices for use by OMPT. + int32_t set_device_offset(int32_t DeviceIdOffset); + + /// Returns if the plugin can support automatic zero-copy. + int32_t use_auto_zero_copy(int32_t DeviceId); + + /// Look up a global symbol in the given binary. + int32_t get_global(__tgt_device_binary Binary, uint64_t Size, + const char *Name, void **DevicePtr); + + /// Look up a kernel function in the given binary. 
+ int32_t get_function(__tgt_device_binary Binary, const char *Name, + void **KernelPtr); + +private: + /// Number of devices available for the plugin. + int32_t NumDevices = 0; + + /// Index offset, which when added to a DeviceId, will yield a unique + /// user-observable device identifier. This is especially important when + /// DeviceIds of multiple plugins / RTLs need to be distinguishable. + int32_t DeviceIdStartIndex = 0; + + /// Array of pointers to the devices. Initially, they are all set to nullptr. + /// Once a device is initialized, the pointer is stored in the position given + /// by its device id. A position with nullptr means that the corresponding + /// device was not initialized yet. + llvm::SmallVector<GenericDeviceTy *> Devices; + + /// OpenMP requires flags. + int64_t RequiresFlags; + + /// Pointer to the global handler for this plugin. + GenericGlobalHandlerTy *GlobalHandler; + + /// Internal allocator for different structures. + BumpPtrAllocator Allocator; + + /// The JIT engine shared by all devices connected to this plugin. + JITEngine JIT; + + /// The interface between the plugin and the GPU for host services. + RPCServerTy *RPCServer; +}; + +namespace Plugin { +/// Create a success error. This is the same as calling Error::success(), but +/// it is recommended to use this one for consistency with Plugin::error() and +/// Plugin::check(). +static Error success() { return Error::success(); } + +/// Create a string error from the format string \p ErrFmt and the arguments +/// \p Args, using an inconvertible error code. +template <typename... ArgsTy> +static Error error(const char *ErrFmt, ArgsTy... Args) { + return createStringError(inconvertibleErrorCode(), ErrFmt, Args...); +} + +/// Check the plugin-specific error code and return an error or success +/// accordingly. In case of an error, create a string error with the error +/// description. The ErrFmt should follow the format: +/// "Error in <function name>[<optional info>]: %s" +/// The last format specifier "%s" is mandatory and will be used to place the +/// error code's description. 
Notice this function should be only called from +/// the plugin-specific code. +/// TODO: Refactor this, must be defined individually by each plugin. +template <typename... ArgsTy> +static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args); +} // namespace Plugin + +/// Class for simplifying the getter operation of the plugin. Anywhere in the +/// code, the current plugin can be retrieved by PluginTy::get(). The class also +/// declares functions to create plugin-specific object instances. The check(), +/// createPlugin(), createDevice() and createGlobalHandler() functions should be +/// defined by each plugin implementation. +class PluginTy { + // Pointer to the plugin instance. + static GenericPluginTy *SpecificPlugin; + + PluginTy() { + if (auto Err = init()) + REPORT("Failed to initialize plugin: %s\n", + toString(std::move(Err)).data()); + } + + ~PluginTy() { + if (auto Err = deinit()) + REPORT("Failed to deinitialize plugin: %s\n", + toString(std::move(Err)).data()); + } + + PluginTy(const PluginTy &) = delete; + void operator=(const PluginTy &) = delete; + + /// Create and initialize the plugin instance. + static Error init() { + assert(!SpecificPlugin && "Plugin already created"); + + // Create the specific plugin. + SpecificPlugin = createPlugin(); + assert(SpecificPlugin && "Plugin was not created"); + + // Initialize the plugin. + return SpecificPlugin->init(); + } + + // Deinitialize and destroy the plugin instance. If deinitializing a device + // or the plugin fails, the error is returned immediately and the plugin + // instance is not deleted. + static Error deinit() { + assert(SpecificPlugin && "Plugin no longer valid"); + + for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices(); + DevNo < NumDev; ++DevNo) + if (auto Err = SpecificPlugin->deinitDevice(DevNo)) + return Err; + + // Deinitialize the plugin. + if (auto Err = SpecificPlugin->deinit()) + return Err; + + // Delete the plugin instance. + delete SpecificPlugin; + + // Invalidate the plugin reference. 
+ SpecificPlugin = nullptr; + + return Plugin::success(); + } + +public: + /// Initialize the plugin if needed. The plugin could have been initialized by + /// a previous call to PluginTy::get(). This function always returns success. + static Error initIfNeeded() { + // Trigger the initialization if needed. + get(); + + return Error::success(); + } + + /// Get a reference (or create if it was not created) to the plugin instance. + static GenericPluginTy &get() { + // This static variable will initialize the underlying plugin instance in + // case there was no previous explicit initialization. The initialization is + // thread safe. + static PluginTy Plugin; + + assert(SpecificPlugin && "Plugin is not active"); + return *SpecificPlugin; + } + + /// Get a reference to the plugin with a specific plugin-specific type. + template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); } + + /// Indicate whether the plugin is active. + static bool isActive() { return SpecificPlugin != nullptr; } + + /// Create a plugin instance. + static GenericPluginTy *createPlugin(); +}; + +/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class +/// acts as a reference to a device resource, such as a stream, and requires +/// some basic functions to be implemented. The derived class should define an +/// empty constructor that creates an empty and invalid resource reference. Do +/// not create a new resource on the ctor, but on the create() function instead. +/// +/// The derived class should also define the type HandleTy as the underlying +/// resource handle type. For instance, in a CUDA stream it would be: +/// using HandleTy = CUstream; +struct GenericDeviceResourceRef { + /// Create a new resource and store a reference to it. + virtual Error create(GenericDeviceTy &Device) = 0; + + /// Destroy and release the resources pointed by the reference. 
+ virtual Error destroy(GenericDeviceTy &Device) = 0; + +protected: + ~GenericDeviceResourceRef() = default; +}; + +/// Class that implements a resource pool belonging to a device. This class +/// operates with references to the actual resources. These references must +/// derive from the GenericDeviceResourceRef class and implement the create +/// and destroy virtual functions. +template <typename ResourceRef> class GenericDeviceResourceManagerTy { + using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; + using ResourceHandleTy = typename ResourceRef::HandleTy; + +public: + /// Create an empty resource pool for a specific device. + GenericDeviceResourceManagerTy(GenericDeviceTy &Device) + : Device(Device), NextAvailable(0) {} + + /// Destroy the resource pool. At this point, the deinit() function should + /// already have been executed so the resource pool should be empty. + virtual ~GenericDeviceResourceManagerTy() { + assert(ResourcePool.empty() && "Resource pool not empty"); + } + + /// Initialize the resource pool with \p InitialSize resources. + Error init(uint32_t InitialSize) { + assert(ResourcePool.empty() && "Resource pool already initialized"); + return ResourcePoolTy::resizeResourcePool(InitialSize); + } + + /// Deinitialize the resource pool and delete all resources. This function + /// must be called before the destructor. + virtual Error deinit() { + if (NextAvailable) + DP("Missing %d resources to be returned\n", NextAvailable); + + // TODO: This prevents a bug on libomptarget to make the plugins fail. There + // may be some resources not returned. Do not destroy these ones. + if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable)) + return Err; + + ResourcePool.clear(); + + return Plugin::success(); + } + + /// Get a resource from the pool or create new ones. If the function + /// succeeds, the handle to the resource is saved in \p Handle. + virtual Error getResource(ResourceHandleTy &Handle) { + // Get a resource with an empty resource processor. 
+ return getResourcesImpl(1, &Handle, + [](ResourceHandleTy) { return Plugin::success(); }); + } + + /// Get multiple resources from the pool or create new ones. If the function + /// succeeds, the handles to the resources are saved in \p Handles. + virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) { + // Get resources with an empty resource processor. + return getResourcesImpl(Num, Handles, + [](ResourceHandleTy) { return Plugin::success(); }); + } + + /// Return resource to the pool. + virtual Error returnResource(ResourceHandleTy Handle) { + // Return a resource with an empty resource processor. + return returnResourceImpl( + Handle, [](ResourceHandleTy) { return Plugin::success(); }); + } + +protected: + /// Get multiple resources from the pool or create new ones. If the function + /// succeeds, the handles to the resources are saved in \p Handles. Also + /// process each of the obtained resources with \p Processor. If \p Processor + /// fails, its error is returned and NextAvailable is not advanced. + template <typename FuncTy> + Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles, + FuncTy Processor) { + const std::lock_guard<std::mutex> Lock(Mutex); + + assert(NextAvailable <= ResourcePool.size() && + "Resource pool is corrupted"); + + if (NextAvailable + Num > ResourcePool.size()) + // Grow the pool to the larger of twice the number of resources in use and + // the number needed to serve this request. + if (auto Err = ResourcePoolTy::resizeResourcePool( + std::max(NextAvailable * 2, NextAvailable + Num))) + return Err; + + // Save the handles in the output array parameter. + for (uint32_t r = 0; r < Num; ++r) + Handles[r] = ResourcePool[NextAvailable + r]; + + // Process all obtained resources. + for (uint32_t r = 0; r < Num; ++r) + if (auto Err = Processor(Handles[r])) + return Err; + + NextAvailable += Num; + + return Plugin::success(); + } + + /// Return resource to the pool and process the resource with \p Processor. 
+ template <typename FuncTy> + Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) { + const std::lock_guard<std::mutex> Lock(Mutex); + + // Process the returned resource. If processing fails, the error is returned + // and the resource is not put back into the pool. + if (auto Err = Processor(Handle)) + return Err; + + assert(NextAvailable > 0 && "Resource pool is corrupted"); + ResourcePool[--NextAvailable] = Handle; + + return Plugin::success(); + } + +protected: + /// The resources between \p OldSize and \p NewSize need to be created or + /// destroyed. The mutex is locked when this function is called. The function + /// stops at the first resource that fails to be created or destroyed. + Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { + assert(OldSize != NewSize && "Resizing to the same size"); + + if (auto Err = Device.setContext()) + return Err; + + if (OldSize < NewSize) { + // Create new resources. + for (uint32_t I = OldSize; I < NewSize; ++I) { + if (auto Err = ResourcePool[I].create(Device)) + return Err; + } + } else { + // Destroy the obsolete resources. + for (uint32_t I = NewSize; I < OldSize; ++I) { + if (auto Err = ResourcePool[I].destroy(Device)) + return Err; + } + } + return Plugin::success(); + } + + /// Increase or decrease the number of resources. This function should + /// be called with the mutex acquired. + Error resizeResourcePool(uint32_t NewSize) { + uint32_t OldSize = ResourcePool.size(); + + // Nothing to do. + if (OldSize == NewSize) + return Plugin::success(); + + if (OldSize < NewSize) { + // Increase the number of resources. + ResourcePool.resize(NewSize); + return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + } + + // Decrease the number of resources otherwise. The pool is shrunk even if + // destroying a resource failed; that error is still returned. + auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + ResourcePool.resize(NewSize); + + return Err; + } + + /// The device to which the resources belong + GenericDeviceTy &Device; + + /// Mutex for the resource pool. + std::mutex Mutex; + + /// The next available resource in the pool. + uint32_t NextAvailable; + + /// The actual resource pool. 
+ std::deque<ResourceRef> ResourcePool; +}; + +/// A static check on whether or not we support RPC in libomptarget. +bool libomptargetSupportsRPC(); + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H |
