summaryrefslogtreecommitdiff
path: root/offload/plugins-nextgen
diff options
context:
space:
mode:
Diffstat (limited to 'offload/plugins-nextgen')
-rw-r--r-- offload/plugins-nextgen/amdgpu/src/rtl.cpp 120
-rw-r--r-- offload/plugins-nextgen/common/include/PluginInterface.h 23
-rw-r--r-- offload/plugins-nextgen/common/src/PluginInterface.cpp 111
-rw-r--r-- offload/plugins-nextgen/cuda/src/rtl.cpp 14
-rw-r--r-- offload/plugins-nextgen/host/src/rtl.cpp 1
5 files changed, 128 insertions, 141 deletions
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 7ba55715ff58..c26cfe961aa0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -924,6 +924,7 @@ private:
void *Dst;
const void *Src;
size_t Size;
+ size_t NumTimes;
};
/// Utility struct holding arguments for freeing buffers to memory managers.
@@ -974,9 +975,14 @@ private:
StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
/// Schedule a host memory copy action on the slot.
- Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+ ///
+ /// Num times will repeat the copy that many times, sequentially in the dest
+ /// buffer.
+ Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+ size_t NumTimes = 1) {
Callbacks.emplace_back(memcpyAction);
- ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+ ActionArgs.emplace_back().MemcpyArgs =
+ MemcpyArgsTy{Dst, Src, Size, NumTimes};
return Plugin::success();
}
@@ -1216,7 +1222,11 @@ private:
assert(Args->Dst && "Invalid destination buffer");
assert(Args->Src && "Invalid source buffer");
- std::memcpy(Args->Dst, Args->Src, Args->Size);
+ auto BasePtr = Args->Dst;
+ for (size_t I = 0; I < Args->NumTimes; I++) {
+ std::memcpy(BasePtr, Args->Src, Args->Size);
+ BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size;
+ }
return Plugin::success();
}
@@ -1421,7 +1431,8 @@ public:
/// manager once the operation completes.
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
- AMDGPUMemoryManagerTy &MemoryManager) {
+ AMDGPUMemoryManagerTy &MemoryManager,
+ size_t NumTimes = 1) {
// Retrieve available signals for the operation's outputs.
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1443,7 +1454,8 @@ public:
// The std::memcpy is done asynchronously using an async handler. We store
// the function's information in the action but it is not actually a
// post action.
- if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+ if (auto Err =
+ Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
return Err;
// Make changes on this slot visible to the async handler's thread.
@@ -1464,7 +1476,11 @@ public:
std::tie(Curr, InputSignal) = consume(OutputSignal);
} else {
// All preceding operations completed, copy the memory synchronously.
- std::memcpy(Inter, Src, CopySize);
+ auto *InterPtr = Inter;
+ for (size_t I = 0; I < NumTimes; I++) {
+ std::memcpy(InterPtr, Src, CopySize);
+ InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize;
+ }
// Return the second signal because it will not be used.
OutputSignals[1]->decreaseUseCount();
@@ -1481,11 +1497,11 @@ public:
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ Agent, CopySize * NumTimes, 1,
+ &InputSignalRaw, OutputSignal->get());
}
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 0, nullptr,
+ Agent, CopySize * NumTimes, 0, nullptr,
OutputSignal->get());
}
@@ -2611,26 +2627,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
- hsa_status_t Status;
+ // Fast case, where we can use the 4 byte hsa_amd_memory_fill
+ if (Size % 4 == 0 &&
+ (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+ uint32_t Pattern;
+ if (PatternSize == 1) {
+ auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr);
+ Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24;
+ } else if (PatternSize == 2) {
+ auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr);
+ Pattern = *Word | (*Word << 16);
+ } else if (PatternSize == 4) {
+ Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+ } else {
+ // Shouldn't be here if the pattern size is outwith those values
+ llvm_unreachable("Invalid pattern size");
+ }
- // We can use hsa_amd_memory_fill for this size, but it's not async so the
- // queue needs to be synchronized first
- if (PatternSize == 4) {
- if (AsyncInfoWrapper.hasQueue())
- if (auto Err = synchronize(AsyncInfoWrapper))
+ if (hasPendingWorkImpl(AsyncInfoWrapper)) {
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
- Status = hsa_amd_memory_fill(TgtPtr,
- *static_cast<const uint32_t *>(PatternPtr),
- Size / PatternSize);
- if (auto Err =
- Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
- return Err;
- } else {
- // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
- // memory and copying to the device in one go.
- return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+ struct MemFillArgsTy {
+ void *Dst;
+ uint32_t Pattern;
+ int64_t Size;
+ };
+ auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+ auto Fill = [](void *Data) {
+ MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+ assert(Args && "Invalid arguments");
+
+ auto Status =
+ hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+ delete Args;
+ auto Err =
+ Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ if (Err) {
+ FATAL_MESSAGE(1, "error performing async fill: %s",
+ toString(std::move(Err)).data());
+ }
+ };
+
+ // hsa_amd_memory_fill doesn't signal completion using a signal, so use
+ // the existing host callback logic to handle that instead
+ return Stream->pushHostCallback(Fill, Args);
+ } else {
+ // If there is no pending work, do the fill synchronously
+ auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+ return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ }
}
+
+ // Slow case; allocate an appropriate memory size and enqueue copies
+ void *PinnedPtr = nullptr;
+ AMDGPUMemoryManagerTy &PinnedMemoryManager =
+ HostDevice.getPinnedMemoryManager();
+ if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+ return Err;
+
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+ PatternSize, PinnedMemoryManager,
+ Size / PatternSize);
}
/// Initialize the async info for interoperability purposes.
@@ -2744,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Product Name", TmpChar);
+ Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
@@ -2861,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Grid Max Size", TmpUInt);
+ Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
if (Status == HSA_STATUS_SUCCESS) {
- auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+ auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{},
+ "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
MaxDim.add("x", GridMaxDim.x);
MaxDim.add("y", GridMaxDim.y);
MaxDim.add("z", GridMaxDim.z);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 75f87cab6049..6ff3ef8cda17 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -417,6 +417,7 @@ struct GenericKernelTy {
case OMP_TGT_EXEC_MODE_SPMD:
case OMP_TGT_EXEC_MODE_GENERIC:
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
return true;
}
return false;
@@ -434,6 +435,8 @@ protected:
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+ return "SPMD-No-Loop";
}
llvm_unreachable("Unknown execution mode!");
}
@@ -471,7 +474,8 @@ private:
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
- /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+ /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+ /// or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -486,6 +490,10 @@ private:
bool isBareMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
}
+ bool isNoLoopMode() const {
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ }
/// The kernel name.
std::string Name;
@@ -831,11 +839,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
- /// Setup the device environment if needed. Notice this setup may not be run
- /// on some plugins. By default, it will be executed, but plugins can change
- /// this behavior by overriding the shouldSetupDeviceEnvironment function.
- Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);
@@ -1035,6 +1038,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
uint32_t getDefaultNumBlocks() const {
return GridValues.GV_Default_Num_Teams;
}
+ uint32_t getDebugKind() const { return OMPX_DebugKind; }
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
@@ -1175,11 +1179,6 @@ private:
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
- /// Indicate whether the device should setup the device environment. Notice
- /// that returning false in this function will change the behavior of the
- /// setupDeviceEnvironment() function.
- virtual bool shouldSetupDeviceEnvironment() const { return true; }
-
/// Indicate whether the device should setup the global device memory pool. If
/// false is return the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
@@ -1235,7 +1234,7 @@ protected:
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
/// Array of peer access states with the rest of devices. This means that if
- /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
+ /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
/// the device I can access device J's memory directly. However, notice this
/// does not mean that device J can access device I's memory directly.
llvm::SmallVector<PeerAccessState> PeerAccesses;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d4b5f914c667..36cdd6035e26 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -371,54 +371,6 @@ public:
};
} // namespace llvm::omp::target::plugin
-// Extract the mapping of host function pointers to device function pointers
-// from the entry table. Functions marked as 'indirect' in OpenMP will have
-// offloading entries generated for them which map the host's function pointer
-// to a global containing the corresponding function pointer on the device.
-static Expected<std::pair<void *, uint64_t>>
-setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
- DeviceImageTy &Image) {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-
- llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
- Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
- llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
- for (const auto &Entry : Entries) {
- if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
- !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
- continue;
-
- assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
- auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
-
- GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
- if (auto Err =
- Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
- return std::move(Err);
-
- HstPtr = Entry.Address;
- if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
- Entry.Size, nullptr))
- return std::move(Err);
- }
-
- // If we do not have any indirect globals we exit early.
- if (IndirectCallTable.empty())
- return std::pair{nullptr, 0};
-
- // Sort the array to allow for more efficient lookup of device pointers.
- llvm::sort(IndirectCallTable,
- [](const auto &x, const auto &y) { return x.first < y.first; });
-
- uint64_t TableSize =
- IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
- void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
- if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
- TableSize, nullptr))
- return std::move(Err);
- return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
-}
-
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
__tgt_async_info *AsyncInfoPtr)
: Device(Device),
@@ -662,6 +614,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+ // Return the number of teams required to cover the loop iterations.
+ if (isNoLoopMode())
+ return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1;
+
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
if (LoopTripCount > 0) {
@@ -939,10 +895,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);
- // Setup the device environment if needed.
- if (auto Err = setupDeviceEnvironment(Plugin, *Image))
- return std::move(Err);
-
// Setup the global device memory pool if needed.
if (!Plugin.getRecordReplay().isReplaying() &&
shouldSetupDeviceMemoryPool()) {
@@ -978,43 +930,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}
-Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
- DeviceImageTy &Image) {
- // There are some plugins that do not need this step.
- if (!shouldSetupDeviceEnvironment())
- return Plugin::success();
-
- // Obtain a table mapping host function pointers to device function pointers.
- auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
- if (!CallTablePairOrErr)
- return CallTablePairOrErr.takeError();
-
- DeviceEnvironmentTy DeviceEnvironment;
- DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
- DeviceEnvironment.NumDevices = Plugin.getNumDevices();
- // TODO: The device ID used here is not the real device ID used by OpenMP.
- DeviceEnvironment.DeviceNum = DeviceId;
- DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
- DeviceEnvironment.ClockFrequency = getClockFrequency();
- DeviceEnvironment.IndirectCallTable =
- reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
- DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
- DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
-
- // Create the metainfo of the device environment global.
- GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
- sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
-
- // Write device environment values to the device.
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
- DP("Missing symbol %s, continue execution anyway.\n",
- DevEnvGlobal.getName().data());
- consumeError(std::move(Err));
- }
- return Plugin::success();
-}
-
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
DeviceImageTy &Image,
uint64_t PoolSize) {
@@ -1337,16 +1252,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
bool ReleaseQueue) {
+ if (!AsyncInfo)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "invalid async info queue");
+
SmallVector<void *> AllocsToDelete{};
{
std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
- if (!AsyncInfo || !AsyncInfo->Queue)
- return Plugin::error(ErrorCode::INVALID_ARGUMENT,
- "invalid async info queue");
-
- if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
- return Err;
+ // The queue can be null when no work has been added to the AsyncInfo, in
+ // which case the device has nothing to synchronize.
+ if (AsyncInfo->Queue)
+ if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
+ return Err;
std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
}
@@ -2252,8 +2170,7 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
GenericGlobalHandlerTy &GHandler = getGlobalHandler();
if (auto Err =
GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
- REPORT("Failure to look up global address: %s\n",
- toString(std::move(Err)).data());
+ consumeError(std::move(Err));
return OFFLOAD_FAIL;
}
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index bf335ab20f75..af3c74636bff 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1060,8 +1060,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
Info.add("CUDA OpenMP Device Number", DeviceId);
Res = cuDeviceGetName(TmpChar, 1000, Device);
- if (Res == CUDA_SUCCESS)
+ if (Res == CUDA_SUCCESS) {
Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
+ Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
+ }
Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR);
@@ -1118,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (Res == CUDA_SUCCESS)
MaxBlock.add("z", TmpInt);
- auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
+ // TODO: I assume CUDA devices have no limit on the amount of threads,
+ // verify this
+ Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "",
+ DeviceInfo::MAX_WORK_SIZE);
+
+ auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "",
+ DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
if (Res == CUDA_SUCCESS)
MaxGrid.add("x", TmpInt);
@@ -1444,7 +1452,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
if (auto Err = Plugin::check(
AttrResult,
- "Error in cuLaunchKernel while setting the memory limits: %s"))
+ "error in cuFuncSetAttribute while setting the memory limits: %s"))
return Err;
MaxDynCGroupMemLimit = MaxDynCGroupMem;
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index f440ebaf17fe..5436cae3b029 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -387,7 +387,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
/// This plugin should not setup the device environment or memory pool.
- virtual bool shouldSetupDeviceEnvironment() const override { return false; };
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
/// Getters and setters for stack size and heap size not relevant.