summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp240
1 files changed, 240 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
new file mode 100644
index 000000000000..89f6b38df9d5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower LDS global variables with target extension type "amdgpu.named.barrier"
+// that require specialized address assignment. It assigns a unique
+// barrier identifier to each named-barrier LDS variable and encodes
+// this identifier within the !absolute_symbol metadata of that global.
+// This encoding ensures that subsequent LDS lowering passes can process these
+// barriers correctly without conflicts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-exec-sync"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+ sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+// Main utility function for special LDS variables lowering.
+static bool lowerExecSyncGlobalVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ SmallVector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ SmallVector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernelLDS(K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
+static bool runLowerExecSyncGlobals(Module &M) {
+ CallGraph CG = CallGraph(M);
+ bool Changed = false;
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerExecSyncGlobalVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+ return Changed;
+}
+
+class AMDGPULowerExecSyncLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerExecSyncLegacy::ID = 0;
+char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of execution synchronization", false,
+ false)
+
+bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
+ return runLowerExecSyncGlobals(M);
+}
+
+ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
+ return new AMDGPULowerExecSyncLegacy();
+}
+
+PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}