diff options
Diffstat (limited to 'offload/plugins-nextgen')
| -rw-r--r-- | offload/plugins-nextgen/amdgpu/src/rtl.cpp | 120 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/include/PluginInterface.h | 23 | ||||
| -rw-r--r-- | offload/plugins-nextgen/common/src/PluginInterface.cpp | 111 | ||||
| -rw-r--r-- | offload/plugins-nextgen/cuda/src/rtl.cpp | 14 | ||||
| -rw-r--r-- | offload/plugins-nextgen/host/src/rtl.cpp | 1 |
5 files changed, 128 insertions, 141 deletions
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7ba55715ff58..c26cfe961aa0 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -924,6 +924,7 @@ private: void *Dst; const void *Src; size_t Size; + size_t NumTimes; }; /// Utility struct holding arguments for freeing buffers to memory managers. @@ -974,9 +975,14 @@ private: StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} /// Schedule a host memory copy action on the slot. - Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { + /// + /// Num times will repeat the copy that many times, sequentually in the dest + /// buffer. + Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size, + size_t NumTimes = 1) { Callbacks.emplace_back(memcpyAction); - ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; + ActionArgs.emplace_back().MemcpyArgs = + MemcpyArgsTy{Dst, Src, Size, NumTimes}; return Plugin::success(); } @@ -1216,7 +1222,11 @@ private: assert(Args->Dst && "Invalid destination buffer"); assert(Args->Src && "Invalid source buffer"); - std::memcpy(Args->Dst, Args->Src, Args->Size); + auto BasePtr = Args->Dst; + for (size_t I = 0; I < Args->NumTimes; I++) { + std::memcpy(BasePtr, Args->Src, Args->Size); + BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size; + } return Plugin::success(); } @@ -1421,7 +1431,8 @@ public: /// manager once the operation completes. Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, - AMDGPUMemoryManagerTy &MemoryManager) { + AMDGPUMemoryManagerTy &MemoryManager, + size_t NumTimes = 1) { // Retrieve available signals for the operation's outputs. AMDGPUSignalTy *OutputSignals[2] = {}; if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) @@ -1443,7 +1454,8 @@ public: // The std::memcpy is done asynchronously using an async handler. We store // the function's information in the action but it is not actually a // post action. - if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) + if (auto Err = + Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes)) return Err; // Make changes on this slot visible to the async handler's thread. @@ -1464,7 +1476,11 @@ public: std::tie(Curr, InputSignal) = consume(OutputSignal); } else { // All preceding operations completed, copy the memory synchronously. - std::memcpy(Inter, Src, CopySize); + auto *InterPtr = Inter; + for (size_t I = 0; I < NumTimes; I++) { + std::memcpy(InterPtr, Src, CopySize); + InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize; + } // Return the second signal because it will not be used. OutputSignals[1]->decreaseUseCount(); @@ -1481,11 +1497,11 @@ public: if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + Agent, CopySize * NumTimes, 1, + &InputSignalRaw, OutputSignal->get()); } return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 0, nullptr, + Agent, CopySize * NumTimes, 0, nullptr, OutputSignal->get()); } @@ -2611,26 +2627,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { - hsa_status_t Status; + // Fast case, where we can use the 4 byte hsa_amd_memory_fill + if (Size % 4 == 0 && + (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) { + uint32_t Pattern; + if (PatternSize == 1) { + auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr); + Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24; + } else if (PatternSize == 2) { + auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr); + Pattern = *Word | (*Word << 16); + } else if (PatternSize == 4) { + Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr); + } else { + // Shouldn't be here if the pattern size is outwith those values + llvm_unreachable("Invalid pattern size"); + } - // We can use hsa_amd_memory_fill for this size, but it's not async so the - // queue needs to be synchronized first - if (PatternSize == 4) { - if (AsyncInfoWrapper.hasQueue()) - if (auto Err = synchronize(AsyncInfoWrapper)) + if (hasPendingWorkImpl(AsyncInfoWrapper)) { + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; - Status = hsa_amd_memory_fill(TgtPtr, - *static_cast<const uint32_t *>(PatternPtr), - Size / PatternSize); - if (auto Err = - Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n")) - return Err; - } else { - // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned - // memory and copying to the device in one go. - return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size"); + struct MemFillArgsTy { + void *Dst; + uint32_t Pattern; + int64_t Size; + }; + auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4}; + auto Fill = [](void *Data) { + MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data); + assert(Args && "Invalid arguments"); + + auto Status = + hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size); + delete Args; + auto Err = + Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + if (Err) { + FATAL_MESSAGE(1, "error performing async fill: %s", + toString(std::move(Err)).data()); + } + }; + + // hsa_amd_memory_fill doesn't signal completion using a signal, so use + // the existing host callback logic to handle that instead + return Stream->pushHostCallback(Fill, Args); + } else { + // If there is no pending work, do the fill synchronously + auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4); + return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + } } + + // Slow case; allocate an appropriate memory size and enqueue copies + void *PinnedPtr = nullptr; + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) + return Err; + + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr, + PatternSize, PinnedMemoryManager, + Size / PatternSize); } /// Initialize the async info for interoperability purposes. @@ -2744,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Product Name", TmpChar); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) @@ -2861,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Grid Max Size", TmpUInt); + Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE); Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxDim = *Info.add("Grid Max Size per Dimension"); + auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{}, + "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); MaxDim.add("x", GridMaxDim.x); MaxDim.add("y", GridMaxDim.y); MaxDim.add("z", GridMaxDim.z); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 75f87cab6049..6ff3ef8cda17 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -417,6 +417,7 @@ struct GenericKernelTy { case OMP_TGT_EXEC_MODE_SPMD: case OMP_TGT_EXEC_MODE_GENERIC: case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: return true; } return false; @@ -434,6 +435,8 @@ protected: return "Generic"; case OMP_TGT_EXEC_MODE_GENERIC_SPMD: return "Generic-SPMD"; + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: + return "SPMD-No-Loop"; } llvm_unreachable("Unknown execution mode!"); } @@ -471,7 +474,8 @@ private: uint32_t BlockLimitClause[3], uint64_t LoopTripCount, uint32_t &NumThreads, bool IsNumThreadsFromUser) const; - /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. + /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop + /// or SPMD mode. bool isGenericSPMDMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; @@ -486,6 +490,10 @@ private: bool isBareMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE; } + bool isNoLoopMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + } /// The kernel name. std::string Name; @@ -831,11 +839,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error unloadBinary(DeviceImageTy *Image); virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; - /// Setup the device environment if needed. Notice this setup may not be run - /// on some plugins. By default, it will be executed, but plugins can change - /// this behavior by overriding the shouldSetupDeviceEnvironment function. - Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); - /// Setup the global device memory pool, if the plugin requires one. Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize); @@ -1035,6 +1038,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { uint32_t getDefaultNumBlocks() const { return GridValues.GV_Default_Num_Teams; } + uint32_t getDebugKind() const { return OMPX_DebugKind; } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } @@ -1175,11 +1179,6 @@ private: virtual Error getDeviceHeapSize(uint64_t &V) = 0; virtual Error setDeviceHeapSize(uint64_t V) = 0; - /// Indicate whether the device should setup the device environment. Notice - /// that returning false in this function will change the behavior of the - /// setupDeviceEnvironment() function. - virtual bool shouldSetupDeviceEnvironment() const { return true; } - /// Indicate whether the device should setup the global device memory pool. If /// false is return the value on the device will be uninitialized. virtual bool shouldSetupDeviceMemoryPool() const { return true; } @@ -1235,7 +1234,7 @@ protected: enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; /// Array of peer access states with the rest of devices. This means that if - /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I has a matrix PeerAccesses with PeerAccesses == AVAILABLE, /// the device I can access device J's memory directly. However, notice this /// does not mean that device J can access device I's memory directly. llvm::SmallVector<PeerAccessState> PeerAccesses; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index d4b5f914c667..36cdd6035e26 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -371,54 +371,6 @@ public: }; } // namespace llvm::omp::target::plugin -// Extract the mapping of host function pointers to device function pointers -// from the entry table. Functions marked as 'indirect' in OpenMP will have -// offloading entries generated for them which map the host's function pointer -// to a global containing the corresponding function pointer on the device. -static Expected<std::pair<void *, uint64_t>> -setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device, - DeviceImageTy &Image) { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - - llvm::ArrayRef<llvm::offloading::EntryTy> Entries( - Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd); - llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; - for (const auto &Entry : Entries) { - if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 || - !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) - continue; - - assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); - auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); - - GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size); - if (auto Err = - Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) - return std::move(Err); - - HstPtr = Entry.Address; - if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(), - Entry.Size, nullptr)) - return std::move(Err); - } - - // If we do not have any indirect globals we exit early. - if (IndirectCallTable.empty()) - return std::pair{nullptr, 0}; - - // Sort the array to allow for more efficient lookup of device pointers. - llvm::sort(IndirectCallTable, - [](const auto &x, const auto &y) { return x.first < y.first; }); - - uint64_t TableSize = - IndirectCallTable.size() * sizeof(std::pair<void *, void *>); - void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE); - if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(), - TableSize, nullptr)) - return std::move(Err); - return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); -} - AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr) : Device(Device), @@ -662,6 +614,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); } + // Return the number of teams required to cover the loop iterations. + if (isNoLoopMode()) + return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; + uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks(); uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max(); if (LoopTripCount > 0) { @@ -939,10 +895,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, // Add the image to list. LoadedImages.push_back(Image); - // Setup the device environment if needed. - if (auto Err = setupDeviceEnvironment(Plugin, *Image)) - return std::move(Err); - // Setup the global device memory pool if needed. if (!Plugin.getRecordReplay().isReplaying() && shouldSetupDeviceMemoryPool()) { @@ -978,43 +930,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return Image; } -Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, - DeviceImageTy &Image) { - // There are some plugins that do not need this step. - if (!shouldSetupDeviceEnvironment()) - return Plugin::success(); - - // Obtain a table mapping host function pointers to device function pointers. - auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image); - if (!CallTablePairOrErr) - return CallTablePairOrErr.takeError(); - - DeviceEnvironmentTy DeviceEnvironment; - DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind; - DeviceEnvironment.NumDevices = Plugin.getNumDevices(); - // TODO: The device ID used here is not the real device ID used by OpenMP. - DeviceEnvironment.DeviceNum = DeviceId; - DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; - DeviceEnvironment.ClockFrequency = getClockFrequency(); - DeviceEnvironment.IndirectCallTable = - reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); - DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; - DeviceEnvironment.HardwareParallelism = getHardwareParallelism(); - - // Create the metainfo of the device environment global. - GlobalTy DevEnvGlobal("__omp_rtl_device_environment", - sizeof(DeviceEnvironmentTy), &DeviceEnvironment); - - // Write device environment values to the device. - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) { - DP("Missing symbol %s, continue execution anyway.\n", - DevEnvGlobal.getName().data()); - consumeError(std::move(Err)); - } - return Plugin::success(); -} - Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, uint64_t PoolSize) { @@ -1337,16 +1252,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) { Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue) { + if (!AsyncInfo) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "invalid async info queue"); + SmallVector<void *> AllocsToDelete{}; { std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex}; - if (!AsyncInfo || !AsyncInfo->Queue) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid async info queue"); - - if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) - return Err; + // This can be false when no work has been added to the AsyncInfo. In which + // case, the device has nothing to synchronize. + if (AsyncInfo->Queue) + if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) + return Err; std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations); } @@ -2252,8 +2170,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, GenericGlobalHandlerTy &GHandler = getGlobalHandler(); if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) { - REPORT("Failure to look up global address: %s\n", - toString(std::move(Err)).data()); + consumeError(std::move(Err)); return OFFLOAD_FAIL; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index bf335ab20f75..af3c74636bff 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1060,8 +1060,10 @@ struct CUDADeviceTy : public GenericDeviceTy { Info.add("CUDA OpenMP Device Number", DeviceId); Res = cuDeviceGetName(TmpChar, 1000, Device); - if (Res == CUDA_SUCCESS) + if (Res == CUDA_SUCCESS) { Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); + } Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR); @@ -1118,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) MaxBlock.add("z", TmpInt); - auto &MaxGrid = *Info.add("Maximum Grid Dimensions", ""); + // TODO: I assume CUDA devices have no limit on the amount of threads, + // verify this + Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "", + DeviceInfo::MAX_WORK_SIZE); + + auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "", + DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); if (Res == CUDA_SUCCESS) MaxGrid.add("x", TmpInt); @@ -1444,7 +1452,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); if (auto Err = Plugin::check( AttrResult, - "Error in cuLaunchKernel while setting the memory limits: %s")) + "error in cuFuncSetAttribute while setting the memory limits: %s")) return Err; MaxDynCGroupMemLimit = MaxDynCGroupMem; } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index f440ebaf17fe..5436cae3b029 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -387,7 +387,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// This plugin should not setup the device environment or memory pool. - virtual bool shouldSetupDeviceEnvironment() const override { return false; }; virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; /// Getters and setters for stack size and heap size not relevant. |
