diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 31 |
1 files changed, 22 insertions, 9 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 846a0b6280f1..3e2b2c351056 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" @@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { DstAS == AMDGPUAS::FLAT_ADDRESS && ST->hasGloballyAddressableScratch(); } + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: { + const Function *F = Intrinsic->getFunction(); + bool HasUniformYZ = + ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true); + std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize( + *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2); + return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1); + } default: return AMDGPU::isIntrinsicSourceOfDivergence(IID); } @@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { // packed into a same wave which gives 1 and 0 after the division by 64 // respectively. // - // FIXME: limit it to 1D kernels only, although that shall be possible - // to perform this optimization is the size of the X dimension is a power - // of 2, we just do not currently have infrastructure to query it. + // The X dimension doesn't reset within a wave if either both the Y + // and Z dimensions are of length 1, or if the X dimension's required + // size is a power of 2. Note, however, if the X dimension's maximum + // size is a power of 2 < the wavefront size, division by the wavefront + // size is guaranteed to yield 0, so this is also a no-reset case. + bool XDimDoesntResetWithinWaves = false; + if (auto *I = dyn_cast<Instruction>(V)) { + const Function *F = I->getFunction(); + XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F); + } using namespace llvm::PatternMatch; uint64_t C; if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_ConstantInt(C))) || match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_ConstantInt(C)))) { - const Function *F = cast<Instruction>(V)->getFunction(); - return C >= ST->getWavefrontSizeLog2() && - ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; + return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves; } Value *Mask; if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_Value(Mask)))) { - const Function *F = cast<Instruction>(V)->getFunction(); - const DataLayout &DL = F->getDataLayout(); return computeKnownBits(Mask, DL).countMinTrailingZeros() >= ST->getWavefrontSizeLog2() && - ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; + XDimDoesntResetWithinWaves; } const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V); |
