summaryrefslogtreecommitdiff
path: root/offload/plugins-nextgen/common
diff options
context:
space:
mode:
author Mingming Liu <mingmingl@google.com> 2025-09-10 15:25:31 -0700
committer GitHub <noreply@github.com> 2025-09-10 15:25:31 -0700
commit 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree 57f4b1f313c8cf74eed8819870f39c36ea263c68 /offload/plugins-nextgen/common
parent 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-format (users/mingmingl-llvm/samplefdo-profile-format)
Diffstat (limited to 'offload/plugins-nextgen/common')
-rw-r--r-- offload/plugins-nextgen/common/include/PluginInterface.h | 23
-rw-r--r-- offload/plugins-nextgen/common/src/PluginInterface.cpp | 111
2 files changed, 25 insertions, 109 deletions
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 75f87cab6049..6ff3ef8cda17 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -417,6 +417,7 @@ struct GenericKernelTy {
case OMP_TGT_EXEC_MODE_SPMD:
case OMP_TGT_EXEC_MODE_GENERIC:
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
return true;
}
return false;
@@ -434,6 +435,8 @@ protected:
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+ return "SPMD-No-Loop";
}
llvm_unreachable("Unknown execution mode!");
}
@@ -471,7 +474,8 @@ private:
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
- /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+ /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+ /// or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -486,6 +490,10 @@ private:
bool isBareMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
}
+ bool isNoLoopMode() const {
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ }
/// The kernel name.
std::string Name;
@@ -831,11 +839,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
- /// Setup the device environment if needed. Notice this setup may not be run
- /// on some plugins. By default, it will be executed, but plugins can change
- /// this behavior by overriding the shouldSetupDeviceEnvironment function.
- Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);
@@ -1035,6 +1038,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
uint32_t getDefaultNumBlocks() const {
return GridValues.GV_Default_Num_Teams;
}
+ uint32_t getDebugKind() const { return OMPX_DebugKind; }
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
@@ -1175,11 +1179,6 @@ private:
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
- /// Indicate whether the device should setup the device environment. Notice
- /// that returning false in this function will change the behavior of the
- /// setupDeviceEnvironment() function.
- virtual bool shouldSetupDeviceEnvironment() const { return true; }
-
/// Indicate whether the device should setup the global device memory pool. If
/// false is returned, the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
@@ -1235,7 +1234,7 @@ protected:
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
/// Array of peer access states with the rest of devices. This means that if
- /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
+ /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
/// the device I can access device J's memory directly. However, notice this
/// does not mean that device J can access device I's memory directly.
llvm::SmallVector<PeerAccessState> PeerAccesses;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d4b5f914c667..36cdd6035e26 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -371,54 +371,6 @@ public:
};
} // namespace llvm::omp::target::plugin
-// Extract the mapping of host function pointers to device function pointers
-// from the entry table. Functions marked as 'indirect' in OpenMP will have
-// offloading entries generated for them which map the host's function pointer
-// to a global containing the corresponding function pointer on the device.
-static Expected<std::pair<void *, uint64_t>>
-setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
- DeviceImageTy &Image) {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-
- llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
- Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
- llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
- for (const auto &Entry : Entries) {
- if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
- !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
- continue;
-
- assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
- auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
-
- GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
- if (auto Err =
- Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
- return std::move(Err);
-
- HstPtr = Entry.Address;
- if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
- Entry.Size, nullptr))
- return std::move(Err);
- }
-
- // If we do not have any indirect globals we exit early.
- if (IndirectCallTable.empty())
- return std::pair{nullptr, 0};
-
- // Sort the array to allow for more efficient lookup of device pointers.
- llvm::sort(IndirectCallTable,
- [](const auto &x, const auto &y) { return x.first < y.first; });
-
- uint64_t TableSize =
- IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
- void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
- if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
- TableSize, nullptr))
- return std::move(Err);
- return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
-}
-
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
__tgt_async_info *AsyncInfoPtr)
: Device(Device),
@@ -662,6 +614,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+ // Return the number of teams required to cover the loop iterations.
+ if (isNoLoopMode())
+ return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1;
+
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
if (LoopTripCount > 0) {
@@ -939,10 +895,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);
- // Setup the device environment if needed.
- if (auto Err = setupDeviceEnvironment(Plugin, *Image))
- return std::move(Err);
-
// Setup the global device memory pool if needed.
if (!Plugin.getRecordReplay().isReplaying() &&
shouldSetupDeviceMemoryPool()) {
@@ -978,43 +930,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}
-Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
- DeviceImageTy &Image) {
- // There are some plugins that do not need this step.
- if (!shouldSetupDeviceEnvironment())
- return Plugin::success();
-
- // Obtain a table mapping host function pointers to device function pointers.
- auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
- if (!CallTablePairOrErr)
- return CallTablePairOrErr.takeError();
-
- DeviceEnvironmentTy DeviceEnvironment;
- DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
- DeviceEnvironment.NumDevices = Plugin.getNumDevices();
- // TODO: The device ID used here is not the real device ID used by OpenMP.
- DeviceEnvironment.DeviceNum = DeviceId;
- DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
- DeviceEnvironment.ClockFrequency = getClockFrequency();
- DeviceEnvironment.IndirectCallTable =
- reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
- DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
- DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
-
- // Create the metainfo of the device environment global.
- GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
- sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
-
- // Write device environment values to the device.
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
- DP("Missing symbol %s, continue execution anyway.\n",
- DevEnvGlobal.getName().data());
- consumeError(std::move(Err));
- }
- return Plugin::success();
-}
-
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
DeviceImageTy &Image,
uint64_t PoolSize) {
@@ -1337,16 +1252,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
bool ReleaseQueue) {
+ if (!AsyncInfo)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "invalid async info queue");
+
SmallVector<void *> AllocsToDelete{};
{
std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
- if (!AsyncInfo || !AsyncInfo->Queue)
- return Plugin::error(ErrorCode::INVALID_ARGUMENT,
- "invalid async info queue");
-
- if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
- return Err;
+ // This can be false when no work has been added to the AsyncInfo. In which
+ // case, the device has nothing to synchronize.
+ if (AsyncInfo->Queue)
+ if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
+ return Err;
std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
}
@@ -2252,8 +2170,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
GenericGlobalHandlerTy &GHandler = getGlobalHandler();
if (auto Err =
GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
- REPORT("Failure to look up global address: %s\n",
- toString(std::move(Err)).data());
+ consumeError(std::move(Err));
return OFFLOAD_FAIL;
}