summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
author: Luke Lau <luke@igalia.com> 2025-11-10 20:10:40 +0800
committer: GitHub <noreply@github.com> 2025-11-10 12:10:40 +0000
commit: bfd4155f234b4e7f826cb57cad7e9876acfac046 (patch)
tree: 5e1783aabe99739580d3c14b014032dc443d6bdc /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent: 1ffe79d092909a2075705a10d932f0af0825577b (diff)
[VPlan] Don't apply predication discount to non-originally-predicated blocks (#160449)
Split off from #158690. Currently, if an instruction needs to be predicated due to tail folding, it will also have a predication discount applied to it in multiple places. This is likely inaccurate because we can expect a tail-folded instruction to be executed on every iteration bar the last. This fixes it by checking whether the instruction/block was originally predicated, and in doing so prevents vectorization with tail folding where we would have had to scalarize the memory op anyway. On llvm-test-suite this causes 4 loops in total to no longer be vectorized with -O3 on arm64-apple-darwin, and there's no observable performance impact.
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 44
1 files changed, 37 insertions, 7 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 45b557026141..566d6eafee63 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1232,6 +1232,30 @@ public:
/// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) const;
+ /// A helper function that returns how much we should divide the cost of a
+ /// predicated block by. Typically this is the reciprocal of the block
+ /// probability, i.e. if we return X we are assuming the predicated block will
+ /// execute once for every X iterations of the loop header so the block should
+ /// only contribute 1/X of its cost to the total cost calculation, but when
+ /// optimizing for code size it will just be 1 as code size costs don't depend
+ /// on execution probabilities.
+ ///
+ /// TODO: We should use actual block probability here, if available.
+ /// Currently, we always assume predicated blocks have a 50% chance of
+ /// executing, apart from blocks that are only predicated due to tail folding.
+ inline unsigned
+ getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
+ BasicBlock *BB) const {
+ // A block that was not predicated in the original loop but only became
+ // predicated through e.g. tail folding gets no discount. A tail-folded
+ // block may still be predicated in the final vector loop iteration, but
+ // unless the trip count is low that iteration is such a small share of
+ // the total that the chance of the block not executing is close to zero.
+ if (!Legal->blockNeedsPredication(BB))
+ return 1;
+ return CostKind == TTI::TCK_CodeSize ? 1 : 2;
+ }
+
/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
/// First result is for scalarization (will be invalid for scalable
@@ -2887,7 +2911,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
- ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
+ ScalarizationCost =
+ ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
}
InstructionCost SafeDivisorCost = 0;
@@ -5032,7 +5057,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
}
// Scale the total scalar cost by block probability.
- ScalarCost /= getPredBlockCostDivisor(CostKind);
+ ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5082,10 +5107,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// stores and instructions that may divide by zero) will now be
// unconditionally executed. For the scalar case, we may not always execute
// the predicated block, if it is an if-else block. Thus, scale the block's
- // cost by the probability of executing it. blockNeedsPredication from
- // Legal is used so as to not include all blocks in tail folded loops.
- if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost /= getPredBlockCostDivisor(CostKind);
+ // cost by the probability of executing it.
+ // getPredBlockCostDivisor will return 1 for blocks that are only predicated
+ // by the header mask when folding the tail.
+ if (VF.isScalar())
+ BlockCost /= getPredBlockCostDivisor(CostKind, BB);
Cost += BlockCost;
}
@@ -5164,7 +5190,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
if (isPredicatedInst(I)) {
- Cost /= getPredBlockCostDivisor(CostKind);
+ Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
// Add the cost of an i1 extract and a branch
auto *VecI1Ty =
@@ -6732,6 +6758,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
SkipCostComputation.contains(UI);
}
+unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
+ return CM.getPredBlockCostDivisor(CostKind, BB);
+}
+
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
VPCostContext &CostCtx) const {