summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
diff options
context:
space:
mode:
author Mingming Liu <mingmingl@google.com> 2025-09-10 15:25:31 -0700
committer GitHub <noreply@github.com> 2025-09-10 15:25:31 -0700
commit 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
parent 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-format (branch: users/mingmingl-llvm/samplefdo-profile-format)
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp 31
1 files changed, 22 insertions, 9 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f1..3e2b2c351056 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
DstAS == AMDGPUAS::FLAT_ADDRESS &&
ST->hasGloballyAddressableScratch();
}
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ const Function *F = Intrinsic->getFunction();
+ bool HasUniformYZ =
+ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
+ std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+ *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+ return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
+ }
default:
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
}
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
// packed into a same wave which gives 1 and 0 after the division by 64
// respectively.
//
- // FIXME: limit it to 1D kernels only, although that shall be possible
- // to perform this optimization is the size of the X dimension is a power
- // of 2, we just do not currently have infrastructure to query it.
+ // The X dimension doesn't reset within a wave if either both the Y
+ // and Z dimensions are of length 1, or if the X dimension's required
+ // size is a power of 2. Note, however, if the X dimension's maximum
+ // size is a power of 2 < the wavefront size, division by the wavefront
+ // size is guaranteed to yield 0, so this is also a no-reset case.
+ bool XDimDoesntResetWithinWaves = false;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *F = I->getFunction();
+ XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
+ }
using namespace llvm::PatternMatch;
uint64_t C;
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C))) ||
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- return C >= ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
}
Value *Mask;
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_Value(Mask)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- const DataLayout &DL = F->getDataLayout();
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ XDimDoesntResetWithinWaves;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);