summary refs log tree commit diff
path: root/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 642
1 files changed, 377 insertions, 265 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af6fce4b1519..47866dac9ad9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -479,7 +479,8 @@ public:
AC(AC), ORE(ORE), VF(VecWidth),
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -517,22 +518,6 @@ public:
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPTransformState &State);
- /// Create a ResumePHI VPInstruction for the induction \p InductionPhiIRI to
- /// resume iteration count in the scalar epilogue from where the vectorized
- /// loop left off, and add it to the scalar preheader of VPlan. Also creates
- /// the induction resume value, and the value for the bypass block, if needed.
- /// \p Step is the SCEV-expanded induction step to use. In cases where the
- /// loop skeleton is more complicated (i.e., epilogue vectorization) and the
- /// resume values can come from an additional bypass block,
- /// \p MainVectorTripCount provides the trip count of the main vector loop,
- /// used to compute the resume value reaching the scalar loop preheader
- /// directly from this additional bypass block.
- void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI,
- const InductionDescriptor &ID, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- VPBuilder &ScalarPHBuilder,
- Value *MainVectorTripCount = nullptr);
-
/// Returns the original loop trip count.
Value *getTripCount() const { return TripCount; }
@@ -588,23 +573,21 @@ protected:
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);
- /// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off.
- /// In cases where the loop skeleton is more complicated (i.e. epilogue
- /// vectorization), \p MainVectorTripCount provides the trip count of the main
- /// loop, used to compute these resume values. If \p IVSubset is provided, it
- /// contains the phi nodes for which resume values are needed, because they
- /// will generate wide induction phis in the epilogue loop.
- void
- createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs,
- Value *MainVectorTripCount = nullptr,
- SmallPtrSetImpl<PHINode *> *IVSubset = nullptr);
+ /// Create and record the values for induction variables to resume coming from
+ /// the additional bypass block.
+ void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart() {}
virtual void printDebugTracesAtEnd() {}
+ /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+ /// vector preheader and its predecessor, also connecting the new block to the
+ /// scalar preheader.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
/// The original loop.
Loop *OrigLoop;
@@ -699,6 +682,10 @@ protected:
BasicBlock *AdditionalBypassBlock = nullptr;
VPlan &Plan;
+
+ /// The vector preheader block of \p Plan, used as target for check blocks
+ /// introduced during skeleton creation.
+ VPBlockBase *VectorPHVPB;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -1744,7 +1731,8 @@ private:
bool needsExtract(Value *V, ElementCount VF) const {
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
- TheLoop->isLoopInvariant(I))
+ TheLoop->isLoopInvariant(I) ||
+ getWideningDecision(I, VF) == CM_Scalarize)
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -2406,12 +2394,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
// End if-block.
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
- assert((Parent || all_of(RepRecipe->operands(),
- [](VPValue *Op) {
- return Op->isDefinedOutsideLoopRegions();
- })) &&
- "Expected a recipe is either within a region or all of its operands "
- "are defined outside the vectorized region.");
+ assert(
+ (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
+ all_of(RepRecipe->operands(),
+ [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
+ "Expected a recipe is either within a region or all of its operands "
+ "are defined outside the vectorized region.");
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
@@ -2466,19 +2454,15 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
return VectorTripCount;
}
-/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the
-/// vector preheader and its predecessor, also connecting the new block to the
-/// scalar preheader.
-static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
- VPBlockBase *VectorPH = Plan.getVectorPreheader();
- VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
+ VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
if (PreVectorPH->getNumSuccessors() != 1) {
assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
"Unexpected successor");
- VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB);
- VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB);
+ VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
}
VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
@@ -2567,7 +2551,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
LoopBypassBlocks.push_back(TCCheckBlock);
// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
- introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+ introduceCheckBlockInVPlan(TCCheckBlock);
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -2585,7 +2569,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
- introduceCheckBlockInVPlan(Plan, SCEVCheckBlock);
+ introduceCheckBlockInVPlan(SCEVCheckBlock);
return SCEVCheckBlock;
}
@@ -2622,10 +2606,25 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
AddedSafetyChecks = true;
- introduceCheckBlockInVPlan(Plan, MemCheckBlock);
+ introduceCheckBlockInVPlan(MemCheckBlock);
return MemCheckBlock;
}
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+ VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+ for (auto &R : make_early_inc_range(*VPBB)) {
+ assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+ R.moveBefore(*IRVPBB, IRVPBB->end());
+ }
+
+ VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+ // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
+
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -2636,64 +2635,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, Twine(Prefix) + "middle.block");
+ replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
-}
-
-void InnerLoopVectorizer::createInductionResumeVPValue(
- VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder,
- Value *MainVectorTripCount) {
- // TODO: Move to LVP or general VPlan construction, once no IR values are
- // generated.
- auto *OrigPhi = cast<PHINode>(&InductionPhiRI->getInstruction());
- Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- assert(VectorTripCount && "Expected valid arguments");
-
- Instruction *OldInduction = Legal->getPrimaryInduction();
- // For the primary induction the end values are known.
- Value *EndValue = VectorTripCount;
- Value *EndValueFromAdditionalBypass = MainVectorTripCount;
- // Otherwise compute them accordingly.
- if (OrigPhi != OldInduction) {
- IRBuilder<> B(LoopVectorPreHeader->getTerminator());
-
- // Fast-math-flags propagate from the original induction instruction.
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
- EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
- EndValue->setName("ind.end");
-
- // Compute the end value for the additional bypass (if applicable).
- if (MainVectorTripCount) {
- B.SetInsertPoint(getAdditionalBypassBlock(),
- getAdditionalBypassBlock()->getFirstInsertionPt());
- EndValueFromAdditionalBypass =
- emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
- II.getKind(), II.getInductionBinOp());
- EndValueFromAdditionalBypass->setName("ind.end");
- }
- }
-
- auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
- VPInstruction::ResumePhi,
- {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
- OrigPhi->getDebugLoc(), "bc.resume.val");
- assert(InductionPhiRI->getNumOperands() == 0 &&
- "InductionPhiRI should not have any operands");
- InductionPhiRI->addOperand(ResumePhiRecipe);
-
- if (EndValueFromAdditionalBypass) {
- // Store the bypass value here, as it needs to be added as operand to its
- // scalar preheader phi node after the epilogue skeleton has been created.
- // TODO: Directly add as extra operand to the VPResumePHI recipe.
- assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
- "entry for OrigPhi already exits");
- Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
- }
+ replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2733,46 +2679,40 @@ static void addFullyUnrolledInstructionsToIgnore(
}
}
-void InnerLoopVectorizer::createInductionResumeVPValues(
- const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
- SmallPtrSetImpl<PHINode *> *IVSubset) {
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variable PHIs of the scalar loop header and
- // fix their starting values, which depend on the counter of the last
- // iteration of the vectorized loop. If we come from one of the
- // LoopBypassBlocks then we need to start from the original start value.
- // Otherwise we provide the trip count from the main vector loop.
- VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader();
- VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin());
- bool HasCanonical = false;
- for (VPRecipeBase &R : *Plan.getScalarHeader()) {
- auto *PhiR = cast<VPIRInstruction>(&R);
- auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction());
- if (!Phi)
- break;
- if (!Legal->getInductionVars().contains(Phi) ||
- (IVSubset && !IVSubset->contains(Phi)))
- continue;
- const InductionDescriptor &II = Legal->getInductionVars().find(Phi)->second;
- createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs),
- LoopBypassBlocks, ScalarPHBuilder,
- MainVectorTripCount);
- auto *ConstStart = dyn_cast<ConstantInt>(II.getStartValue());
- auto *ConstStep = II.getConstIntStepValue();
- if (Phi->getType() == VectorTripCount->getType() && ConstStart &&
- ConstStart->isZero() && ConstStep && ConstStep->isOne())
- HasCanonical = true;
- }
-
- if (!IVSubset || HasCanonical)
- return;
- // When vectorizing the epilogue, create a resume phi for the canonical IV if
- // no suitable resume phi was already created.
- ScalarPHBuilder.createNaryOp(
- VPInstruction::ResumePhi,
- {&Plan.getVectorTripCount(),
- Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))},
- {}, "vec.epilog.resume.val");
+void InnerLoopVectorizer::createInductionAdditionalBypassValues(
+ const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
+ assert(MainVectorTripCount && "Must have bypass information");
+
+ Instruction *OldInduction = Legal->getPrimaryInduction();
+ IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
+ getAdditionalBypassBlock()->getFirstInsertionPt());
+ for (const auto &InductionEntry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = InductionEntry.first;
+ const InductionDescriptor &II = InductionEntry.second;
+ Value *Step = getExpandedStep(II, ExpandedSCEVs);
+ // For the primary induction the additional bypass end value is known.
+ // Otherwise it is computed.
+ Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+ if (OrigPhi != OldInduction) {
+ auto *BinOp = II.getInductionBinOp();
+ // Fast-math-flags propagate from the original induction instruction.
+ if (isa_and_nonnull<FPMathOperator>(BinOp))
+ BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+ // Compute the end value for the additional bypass.
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+ II.getStartValue(), Step, II.getKind(), BinOp);
+ EndValueFromAdditionalBypass->setName("ind.end");
+ }
+
+ // Store the bypass value here, as it needs to be added as operand to its
+ // scalar preheader phi node after the epilogue skeleton has been created.
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
+ "entry for OrigPhi already exits");
+ Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
+ }
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
@@ -2832,9 +2772,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
// faster.
emitMemRuntimeChecks(LoopScalarPreHeader);
- // Emit phis for the new starting index of the scalar loop.
- createInductionResumeVPValues(ExpandedSCEVs);
-
return LoopVectorPreHeader;
}
@@ -3048,22 +2985,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
- // When dealing with uncountable early exits we create middle.split blocks
- // between the vector loop region and the exit block. These blocks need
- // adding to any outer loop.
- VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
- Loop *OuterLoop = OrigLoop->getParentLoop();
- if (Legal->hasUncountableEarlyExit() && OuterLoop) {
- VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock();
- VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor();
- while (PredVPBB && PredVPBB != VectorRegion) {
- BasicBlock *MiddleSplitBB =
- State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)];
- OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI);
- PredVPBB = PredVPBB->getSinglePredecessor();
- }
- }
-
// After vectorization, the exit blocks of the original loop will have
// additional predecessors. Invalidate SCEVs for the exit phis in case SE
// looked through single-entry phis.
@@ -3091,9 +3012,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
}
+ // Don't apply optimizations below when no vector region remains, as they all
+ // require a vector loop at the moment.
+ if (!State.Plan->getVectorLoopRegion())
+ return;
+
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
@@ -3576,10 +3503,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
- // For scalable vectors, the only interleave factor currently supported
- // must be power of 2 since we require the (de)interleave2 intrinsics
- // instead of shufflevectors.
- if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+ // We currently only know how to emit interleave/deinterleave with
+ // Factor=2 for scalable vectors. This is purely an implementation
+ // limit.
+ if (VF.isScalable() && InterleaveFactor != 2)
return false;
// If the group involves a non-integral pointer, we may not be able to
@@ -4768,7 +4695,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
!isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
- LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
return ChosenFactor;
}
#endif
@@ -7697,6 +7623,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
"when vectorizing, the scalar cost must be computed.");
#endif
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
return BestFactor;
}
@@ -7802,7 +7729,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
- &BestVPlan, Legal->getWidestInductionType());
+ &BestVPlan, OrigLoop->getParentLoop(),
+ Legal->getWidestInductionType());
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
@@ -7810,11 +7738,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 0. Generate SCEV-dependent code in the entry, including TripCount, before
// making any changes to the CFG.
- if (!BestVPlan.getEntry()->empty()) {
- State.CFG.PrevBB = OrigLoop->getLoopPreheader();
- State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
+ if (!BestVPlan.getEntry()->empty())
BestVPlan.getEntry()->execute(&State);
- }
+
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
else
@@ -7823,6 +7749,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
+ VPBasicBlock *VectorPH =
+ cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
if (VectorizingEpilogue)
@@ -7860,19 +7788,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.prepareToExecute(
ILV.getTripCount(),
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+ replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
BestVPlan.execute(&State);
- auto *ExitVPBB = BestVPlan.getMiddleBlock();
+ auto *MiddleVPBB = BestVPlan.getMiddleBlock();
// 2.5 When vectorizing the epilogue, fix reduction and induction resume
// values from the additional bypass block.
if (VectorizingEpilogue) {
assert(!ILV.Legal->hasUncountableEarlyExit() &&
"Epilogue vectorisation not yet supported with early exits");
BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
- for (VPRecipeBase &R : *ExitVPBB) {
+ for (VPRecipeBase &R : *MiddleVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
- &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
+ &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
}
BasicBlock *PH = OrigLoop->getLoopPreheader();
for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
@@ -7885,30 +7814,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- MDNode *OrigLoopID = OrigLoop->getLoopID();
+ if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
- std::optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
-
- VPBasicBlock *HeaderVPBB =
- BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
- Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
- if (VectorizedLoopID)
- L->setLoopID(*VectorizedLoopID);
- else {
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
+ std::optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+ if (VectorizedLoopID) {
+ L->setLoopID(*VectorizedLoopID);
+ } else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
+
+ LoopVectorizeHints Hints(L, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
+ addRuntimeUnrollDisableMetaData(L);
}
- TargetTransformInfo::UnrollingPreferences UP;
- TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
- addRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7917,15 +7847,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
// 4. Adjust branch weight of the branch in the middle block.
- auto *MiddleTerm =
- cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
- if (MiddleTerm->isConditional() &&
- hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ if (BestVPlan.getVectorLoopRegion()) {
+ auto *MiddleVPBB = BestVPlan.getMiddleBlock();
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
}
return State.ExpandedSCEVs;
@@ -7968,17 +7901,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// Generate the induction variable.
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- // Generate VPValues and ResumePhi recipes for wide inductions in the epilogue
- // plan only. Other inductions only need a resume value for the canonical
- // induction, which will get created during epilogue skeleton construction.
- SmallPtrSet<PHINode *, 4> WideIVs;
- for (VPRecipeBase &H :
- EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&H))
- WideIVs.insert(WideIV->getPHINode());
- }
- createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs);
-
return LoopVectorPreHeader;
}
@@ -8048,7 +7970,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
- introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+ introduceCheckBlockInVPlan(TCCheckBlock);
return TCCheckBlock;
}
@@ -8128,14 +8050,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
Phi->removeIncomingValue(EPI.MemSafetyCheck);
}
- // Generate induction resume values. These variables save the new starting
- // indexes for the scalar loop. They are used to test if there are any tail
- // iterations left once the vector loop has completed.
- // Note that when the vectorized epilogue is skipped due to iteration count
- // check, then the resume value for the induction variable comes from
- // the trip count of the main vector loop, passed as the second argument.
- createInductionResumeVPValues(ExpandedSCEVs, EPI.VectorTripCount);
-
+ // Generate bypass values from the additional bypass block. Note that when the
+ // vectorized epilogue is skipped due to iteration count check, then the
+ // resume value for the induction variable comes from the trip count of the
+ // main vector loop, passed as the second argument.
+ createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
return LoopVectorPreHeader;
}
@@ -8185,13 +8104,13 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
// A new entry block has been created for the epilogue VPlan. Hook it in, as
// otherwise we would try to modify the entry to the main vector loop.
- VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert);
+ VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
VPBasicBlock *OldEntry = Plan.getEntry();
VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
Plan.setEntry(NewEntry);
- delete OldEntry;
+ // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
- introduceCheckBlockInVPlan(Plan, Insert);
+ introduceCheckBlockInVPlan(Insert);
return Insert;
}
@@ -8435,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
- if (Reverse)
+ if (Reverse) {
+ // When folding the tail, we may compute an address that we don't in the
+ // original scalar loop and it may not be inbounds. Drop Inbounds in that
+ // case.
+ GEPNoWrapFlags Flags =
+ (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
+ ? GEPNoWrapFlags::none()
+ : GEPNoWrapFlags::inBounds();
VectorPtr = new VPReverseVectorPointerRecipe(
- Ptr, &Plan.getVF(), getLoadStoreType(I),
- GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
- : GEPNoWrapFlags::none(),
- I->getDebugLoc());
- else
+ Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+ } else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
+ }
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
}
@@ -8955,14 +8879,56 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
-/// Create resume phis in the scalar preheader for first-order recurrences and
-/// reductions and update the VPIRInstructions wrapping the original phis in the
-/// scalar header.
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+ VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder,
+ VPTypeAnalysis &TypeInfo,
+ VPValue *VectorTC) {
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ // Truncated wide inductions resume from the last lane of their vector value
+ // in the last vector iteration which is handled elsewhere.
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *EndValue = VectorTC;
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ EndValue = VectorPHBuilder.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, VectorTC, Step);
+ }
+
+ // EndValue is derived from the vector trip count (which has the same type as
+ // the widest induction) and thus may be wider than the induction here.
+ Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+ EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+ ScalarTypeOfWideIV,
+ WideIV->getDebugLoc());
+ }
+
+ auto *ResumePhiRecipe =
+ ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
+ WideIV->getDebugLoc(), "bc.resume.val");
+ return ResumePhiRecipe;
+}
+
+/// Create resume phis in the scalar preheader for first-order recurrences,
+/// reductions and inductions, and update the VPIRInstructions wrapping the
+/// original phis in the scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
- VPBuilder ScalarPHBuilder(ScalarPH);
+ VPBuilder VectorPHBuilder(
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPBuilder ScalarPHBuilder(ScalarPH);
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
@@ -8970,9 +8936,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
if (!ScalarPhiI)
break;
+
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
- if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
+ if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+ WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+ &Plan.getVectorTripCount())) {
+ ScalarPhiIRI->addOperand(ResumePhi);
+ continue;
+ }
+ // TODO: Also handle truncated inductions here. Computing end-values
+ // separately should be done as VPlan-to-VPlan optimization, after
+ // legalizing all resume values to use the last lane from the loop.
+ assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+ "should only skip truncated wide inductions");
continue;
+ }
+
// The backedge value provides the value to resume coming out of a loop,
// which for FORs is a vector whose last element needs to be extracted. The
// start value provides the value if the loop is bypassed.
@@ -8990,14 +8970,73 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
}
}
+/// Return true if \p VPV is an optimizable IV or IV use: either a wide
+/// induction that is not a truncated int/FP induction, or a two-operand recipe
+/// that increments a wide induction by its step. Returns false otherwise.
+static bool isOptimizableIVOrUse(VPValue *VPV) {
+ VPRecipeBase *Def = VPV->getDefiningRecipe();
+ if (!Def) // No defining recipe: nothing to classify.
+ return false;
+ auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
+ if (WideIV) {
+ // VPV itself is a wide induction. Pointer inductions always qualify;
+ // int/FP inductions qualify only when they have no truncate instruction.
+ return isa<VPWidenPointerInductionRecipe>(WideIV) ||
+ !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
+ }
+
+ // Check if VPV is an optimizable induction increment (binary recipes only).
+ if (Def->getNumOperands() != 2)
+ return false;
+ WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
+ if (!WideIV) // Operand 0 is not a wide induction; try operand 1 instead.
+ WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
+ if (!WideIV) // Neither operand is a wide induction.
+ return false;
+
+ using namespace VPlanPatternMatch;
+ auto &ID = WideIV->getInductionDescriptor();
+
+ // Check if VPV updates the induction by exactly the induction's own step.
+ VPValue *IVStep = WideIV->getStepValue();
+ switch (ID.getInductionOpcode()) {
+ case Instruction::Add:
+ return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
+ m_Specific(IVStep)));
+ case Instruction::FAdd:
+ return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
+ m_Specific(IVStep)));
+ case Instruction::FSub: // m_Binary, not m_c_Binary; presumably order-sensitive.
+ return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
+ m_Specific(IVStep)));
+ case Instruction::Sub: {
+ // IVStep will be the negated step of the subtraction, so compare constants:
+ // both must be live-in ConstantInts with Step == -1 * IVStep.
+ VPValue *Step;
+ if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
+ !Step->isLiveIn() || !IVStep->isLiveIn())
+ return false;
+ auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
+ auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
+ return StepCI && IVStepCI &&
+ StepCI->getValue() == (-1 * IVStepCI->getValue());
+ }
+ default: // Other opcodes: only a GEP of a pointer induction by its step.
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ match(VPV, m_GetElementPtr(m_Specific(WideIV),
+ m_Specific(WideIV->getStepValue())));
+ }
+ llvm_unreachable("should have been covered by switch above");
+}
+
// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
-static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
- Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+static SetVector<VPIRInstruction *>
+collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
+ VPlan &Plan) {
auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
@@ -9022,18 +9061,9 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- }))) {
- if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
- continue;
- }
+ if (isOptimizableIVOrUse(V) &&
+ ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+ continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
@@ -9239,9 +9269,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
- // must be power of 2 since we require the (de)interleave2 intrinsics
- // instead of shufflevectors.
- assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+ // is 2 since we require the (de)interleave2 intrinsics instead of
+ // shufflevectors.
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
@@ -9335,7 +9365,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
- VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+ VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
@@ -9348,14 +9378,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ // Update wide induction increments to use the same step as the corresponding
+ // wide induction. This enables detecting induction increments directly in
+ // VPlan and removes redundant splats.
+ for (const auto &[Phi, ID] : Legal->getInductionVars()) {
+ auto *IVInc = cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
+ continue;
+ VPWidenInductionRecipe *WideIV =
+ cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
+ VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
+ R->setOperand(1, WideIV->getStepValue());
+ }
+
if (auto *UncountableExitingBlock =
Legal->getUncountableEarlyExitingBlock()) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
addScalarResumePhis(RecipeBuilder, *Plan);
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ SetVector<VPIRInstruction *> ExitUsersToFix =
+ collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
reportVectorizationFailure(
@@ -9474,6 +9518,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
+
+ // Collect mapping of IR header phis to header phi recipes, to be used in
+ // addScalarResumePhis.
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+ for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
+ RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
+ }
+ addScalarResumePhis(RecipeBuilder, *Plan);
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9762,13 +9818,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPLane(0));
- Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
+ Value *Index = State.get(getOperand(1), VPLane(0));
Value *DerivedIV = emitTransformedIndex(
- State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
- Kind, cast_if_present<BinaryOperator>(FPBinOp));
+ State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
+ cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
-
+ // If index is the vector trip count, the concrete value will only be set in
+ // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
+ // TODO: Remove the special case for the vector trip count once it is computed
+ // in VPlan and can be used during VPlan simplification.
+ assert((DerivedIV != Index ||
+ getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
+ "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
@@ -10078,6 +10139,57 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
!EnableLoopVectorization) {}
+/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
+/// vectorization: drop scalar-header phis (and their ResumePhis) not widened in
+/// \p EpiPlan, and ensure a ResumePhi of the vector trip count exists.
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
+ // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
+ // will need their resume-values computed in the main vector loop. Others
+ // can be removed from the main VPlan.
+ SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
+ for (VPRecipeBase &R :
+ EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R)) // The canonical IV is not collected.
+ continue;
+ EpiWidenedPhis.insert(
+ cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
+ }
+ for (VPRecipeBase &R : make_early_inc_range(
+ *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
+ auto *VPIRInst = cast<VPIRInstruction>(&R);
+ auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
+ if (!IRI) // Stop at the first recipe that does not wrap a PHINode.
+ break;
+ if (EpiWidenedPhis.contains(IRI)) // Widened in EpiPlan: keep as is.
+ continue;
+ // There is no corresponding wide induction in the epilogue plan that would
+ // need a resume value. Remove the VPIRInst wrapping the scalar header phi
+ // together with the corresponding ResumePhi. The resume values for the
+ // scalar loop will be created during execution of EpiPlan.
+ VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
+ VPIRInst->eraseFromParent(); // Erase the user before its operand's recipe.
+ ResumePhi->eraseFromParent();
+ }
+ VPlanTransforms::removeDeadRecipes(MainPlan);
+
+ using namespace VPlanPatternMatch;
+ VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
+ VPValue *VectorTC = &MainPlan.getVectorTripCount();
+ // If the scalar preheader (which will become the vector epilogue preheader)
+ // already holds a ResumePhi of the vector trip count and constant 0, it
+ // serves as the canonical induction's resume value; otherwise create one.
+ if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
+ return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
+ m_Specific(VectorTC), m_SpecificInt(0)));
+ }))
+ return;
+ VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
+ ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+ "vec.epilog.resume.val");
+}
+
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
static void
@@ -10542,12 +10654,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
BestEpiPlan);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks,
*BestMainPlan);
-
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;