diff options
| author | Gang Chen <gangc@amd.com> | 2024-11-06 10:37:22 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-11-06 10:37:22 -0800 |
| commit | 8c752900dda82115ebb8231e6d5ac703e703547e (patch) | |
| tree | ce774d8fa97a3ca4560d55c96b70969a280706dc /llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | |
| parent | ffc2233395f0b1a3a0c277d196bb0a0ccae84ab7 (diff) | |
[AMDGPU] modify named barrier builtins and intrinsics (#114550)
Use a local pointer type to represent the named barrier in builtins and
intrinsics. This makes the definitions more user-friendly
because users do not need to worry about the hardware ID assignment. Also,
this approach is closer to other popular GPU programming languages.
Named barriers should be represented as global variables of addrspace(3)
in LLVM IR. The compiler assigns special LDS offsets to those variables
during the AMDGPULowerModuleLDS pass. Those addresses are converted to hardware
barrier IDs during instruction selection. The rest of the
instruction-selection changes are primarily due to the
intrinsic-definition changes.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 5791daed0065..a76d92ee91ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -920,6 +920,124 @@ public: return KernelToCreatedDynamicLDS; } + static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (isKernelLDS(F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernelLDS(F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; + } + + bool lowerSpecialLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + std::vector<GlobalVariable *> OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + int BarId = ++NumAbsolutes; + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + std::vector<Function *> OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + llvm::DenseMap<Function *, uint32_t> Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + int BarId = (NumAbsolutes + 1); + if (Kernel2BarId.find(F) != Kernel2BarId.end()) { + BarId = (Kernel2BarId[F] + 1); + } + Kernel2BarId[F] = BarId; + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; + } + bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -942,6 +1060,12 @@ public: } } + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerSpecialLDSVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + // Partition variables accessed indirectly into the different strategies DenseSet<GlobalVariable *> ModuleScopeVariables; DenseSet<GlobalVariable *> TableLookupVariables; |
