diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 439 |
1 files changed, 138 insertions, 301 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 93a5f22bd097..febdc54e666a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1762,9 +1762,10 @@ public: GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, TTI::TargetCostKind CostKind) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), - MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE), - CostKind(CostKind) {} + : DT(DT), LI(LI), TTI(TTI), + SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + PSE(PSE), CostKind(CostKind) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -2438,8 +2439,9 @@ struct CSEDenseMapInfo { } // end anonymous namespace -///Perform cse of induction variable instructions. -static void cse(BasicBlock *BB) { +/// FIXME: This legacy common-subexpression-elimination routine is scheduled for +/// removal, in favor of the VPlan-based one. +static void legacyCSE(BasicBlock *BB) { // Perform simple cse. SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; for (Instruction &In : llvm::make_early_inc_range(*BB)) { @@ -2543,7 +2545,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; // Remove redundant induction instructions. - cse(HeaderBB); + legacyCSE(HeaderBB); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { @@ -3901,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4158,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -4260,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); - return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); })) return false; @@ -5692,11 +5696,25 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) if (auto *InstOp = dyn_cast<Instruction>(Op)) - if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && + if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) && AddrDefs.insert(InstOp).second) Worklist.push_back(InstOp); } + auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) { + // If there are direct memory op users of the newly scalarized load, + // their cost may have changed because there's no scalarization + // overhead for the operand. Update it. + for (User *U : LI->users()) { + if (!isa<LoadInst, StoreInst>(U)) + continue; + if (getWideningDecision(cast<Instruction>(U), VF) != CM_Scalarize) + continue; + setWideningDecision( + cast<Instruction>(U), VF, CM_Scalarize, + getMemInstScalarizationCost(cast<Instruction>(U), VF)); + } + }; for (auto *I : AddrDefs) { if (isa<LoadInst>(I)) { // Setting the desired widening decision should ideally be handled in @@ -5706,21 +5724,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse || (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) && - Decision == CM_Scalarize)) + Decision == CM_Scalarize)) { // Scalarize a widened load of address or update the cost of a scalar // load of an address. setWideningDecision( I, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); - else if (const auto *Group = getInterleavedAccessGroup(I)) { + UpdateMemOpUserCost(cast<LoadInst>(I)); + } else if (const auto *Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { - if (Instruction *Member = Group->getMember(I)) + if (Instruction *Member = Group->getMember(I)) { setWideningDecision( Member, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + UpdateMemOpUserCost(cast<LoadInst>(Member)); + } } } } else { @@ -6833,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7066,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -7170,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); - VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks, + BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = @@ -7260,8 +7283,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) - SE.forgetLcssaPhiWithNewPredecessor( - OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + SE.forgetLcssaPhiWithNewPredecessor(OrigLoop, + &cast<VPIRPhi>(PhiR).getIRPhi()); } // Forget the original loop and block dispositions. SE.forgetLoop(OrigLoop); @@ -7482,12 +7505,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, VPSingleDefRecipe *VectorPtr; if (Reverse) { // When folding the tail, we may compute an address that we don't in the - // original scalar loop and it may not be inbounds. Drop Inbounds in that - // case. + // original scalar loop: drop the GEP no-wrap flags in this case. + // Otherwise preserve existing flags without no-unsigned-wrap, as we will + // emit negative indices. GEPNoWrapFlags Flags = - (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + CM.foldTailByMasking() || !GEP ? GEPNoWrapFlags::none() - : GEPNoWrapFlags::inBounds(); + : GEP->getNoWrapFlags().withoutNoUnsignedWrap(); VectorPtr = new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); @@ -7934,6 +7958,13 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { + const APInt *C; + if (I > 0 && match(OpI, m_APInt(C)) && + canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) { + ExtOpTypes[I] = ExtOpTypes[0]; + ExtKinds[I] = ExtKinds[0]; + continue; + } Value *ExtOp; if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; @@ -8159,14 +8190,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes( std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) { - bool HasScalarVF = Plan->hasScalarVFOnly(); // Now optimize the initial VPlan. - if (!HasScalarVF) - VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths, - *Plan, CM.getMinimalBitwidths()); + VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths, + *Plan, CM.getMinimalBitwidths()); VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan); // TODO: try to put it close to addActiveLaneMask(). - if (CM.foldTailWithEVL() && !HasScalarVF) + if (CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan, CM.getMaxSafeElements()); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); @@ -8176,211 +8205,6 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the -/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute -/// the end value of the induction. -static VPInstruction *addResumePhiRecipeForInduction( - VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, - VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { - auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); - // Truncated wide inductions resume from the last lane of their vector value - // in the last vector iteration which is handled elsewhere. - if (WideIntOrFp && WideIntOrFp->getTruncInst()) - return nullptr; - - VPValue *Start = WideIV->getStartValue(); - VPValue *Step = WideIV->getStepValue(); - const InductionDescriptor &ID = WideIV->getInductionDescriptor(); - VPValue *EndValue = VectorTC; - if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { - EndValue = VectorPHBuilder.createDerivedIV( - ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), - Start, VectorTC, Step); - } - - // EndValue is derived from the vector trip count (which has the same type as - // the widest induction) and thus may be wider than the induction here. - Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); - if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { - EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, - ScalarTypeOfWideIV, - WideIV->getDebugLoc()); - } - - auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi( - {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val"); - return ResumePhiRecipe; -} - -/// Create resume phis in the scalar preheader for first-order recurrences, -/// reductions and inductions, and update the VPIRInstructions wrapping the -/// original phis in the scalar header. End values for inductions are added to -/// \p IVEndValues. -static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, - DenseMap<VPValue *, VPValue *> &IVEndValues) { - VPTypeAnalysis TypeInfo(Plan); - auto *ScalarPH = Plan.getScalarPreheader(); - auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]); - VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); - VPBuilder VectorPHBuilder( - cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())); - VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - VPBuilder ScalarPHBuilder(ScalarPH); - for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) { - auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR); - - // TODO: Extract final value from induction recipe initially, optimize to - // pre-computed end value together in optimizeInductionExitUsers. - auto *VectorPhiR = - cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi())); - if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { - if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( - WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, - &Plan.getVectorTripCount())) { - assert(isa<VPPhi>(ResumePhi) && "Expected a phi"); - IVEndValues[WideIVR] = ResumePhi->getOperand(0); - ScalarPhiIRI->addOperand(ResumePhi); - continue; - } - // TODO: Also handle truncated inductions here. Computing end-values - // separately should be done as VPlan-to-VPlan optimization, after - // legalizing all resume values to use the last lane from the loop. - assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && - "should only skip truncated wide inductions"); - continue; - } - - // The backedge value provides the value to resume coming out of a loop, - // which for FORs is a vector whose last element needs to be extracted. The - // start value provides the value if the loop is bypassed. - bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR); - auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); - assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && - "Cannot handle loops with uncountable early exits"); - if (IsFOR) - ResumeFromVectorLoop = MiddleBuilder.createNaryOp( - VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {}, - "vector.recur.extract"); - StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; - auto *ResumePhiR = ScalarPHBuilder.createScalarPhi( - {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); - ScalarPhiIRI->addOperand(ResumePhiR); - } -} - -/// Handle users in the exit block for first order reductions in the original -/// exit block. The penultimate value of recurrences is fed to their LCSSA phi -/// users in the original exit block using the VPIRInstruction wrapping to the -/// LCSSA phi. -static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { - VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); - auto *ScalarPHVPBB = Plan.getScalarPreheader(); - auto *MiddleVPBB = Plan.getMiddleBlock(); - VPBuilder ScalarPHBuilder(ScalarPHVPBB); - VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - - auto IsScalableOne = [](ElementCount VF) -> bool { - return VF == ElementCount::getScalable(1); - }; - - for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { - auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); - if (!FOR) - continue; - - assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && - "Cannot handle loops with uncountable early exits"); - - // This is the second phase of vectorizing first-order recurrences, creating - // extract for users outside the loop. An overview of the transformation is - // described below. Suppose we have the following loop with some use after - // the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s.init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s.init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts - // for users in the scalar preheader and exit block. - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3] - // b[i] = v2 - v1 - // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v1 - // br cond, vector.body, middle.block - // - // middle.block: - // vector.recur.extract.for.phi = v2(2) - // vector.recur.extract = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // scalar.recur.init = phi [vector.recur.extract, middle.block], - // [s.init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], - // [vector.recur.extract.for.phi, middle.block] - // - // Now update VPIRInstructions modeling LCSSA phis in the exit block. - // Extract the penultimate value of the recurrence and use it as operand for - // the VPIRInstruction modeling the phi. - for (VPUser *U : FOR->users()) { - using namespace llvm::VPlanPatternMatch; - if (!match(U, m_ExtractLastElement(m_Specific(FOR)))) - continue; - // For VF vscale x 1, if vscale = 1, we are unable to extract the - // penultimate value of the recurrence. Instead we rely on the existing - // extract of the last element from the result of - // VPInstruction::FirstOrderRecurrenceSplice. - // TODO: Consider vscale_range info and UF. - if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne, - Range)) - return; - VPValue *PenultimateElement = MiddleBuilder.createNaryOp( - VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, - {}, "vector.recur.extract.for.phi"); - cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement); - } - } -} - VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) { @@ -8417,14 +8241,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // the vector loop or when not folding the tail. In the later case, we know // that the canonical induction increment will not overflow as the vector trip // count is >= increment and a multiple of the increment. + VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; if (!HasNUW) { - auto *IVInc = Plan->getVectorLoopRegion() - ->getExitingBasicBlock() - ->getTerminator() - ->getOperand(0); - assert(match(IVInc, m_VPInstruction<Instruction::Add>( - m_Specific(Plan->getCanonicalIV()), m_VPValue())) && + auto *IVInc = + LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0); + assert(match(IVInc, + m_VPInstruction<Instruction::Add>( + m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) && "Did not find the canonical IV increment"); cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags(); } @@ -8470,7 +8294,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. - VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( HeaderVPBB); @@ -8554,8 +8377,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( for (VPValue *Old : Old2New.keys()) Old->getDefiningRecipe()->eraseFromParent(); - assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && - !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && + assert(isa<VPRegionBlock>(LoopRegion) && + !LoopRegion->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); @@ -8573,9 +8396,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( R->setOperand(1, WideIV->getStepValue()); } - addExitUsersForFirstOrderRecurrences(*Plan, Range); + // TODO: We can't call runPass on these transforms yet, due to verifier + // failures. + VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range); DenseMap<VPValue *, VPValue *> IVEndValues; - addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); + VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8596,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8665,7 +8491,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { Plan->addVF(VF); if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( - Plan, + *Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, @@ -8686,7 +8512,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { DenseMap<VPValue *, VPValue *> IVEndValues; // TODO: IVEndValues are not used yet in the native path, to optimize exit // values. - addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); + // TODO: We can't call runPass on the transform yet, due to verifier + // failures. + VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -8946,13 +8774,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) && "Unexpected truncated min-max recurrence!"); Type *RdxTy = RdxDesc.getRecurrenceType(); - auto *Trunc = - new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); + VPWidenCastRecipe *Trunc; Instruction::CastOps ExtendOpc = RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt; - auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy); - Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); - Extnd->insertAfter(Trunc); + VPWidenCastRecipe *Extnd; + { + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint( + NewExitingVPV->getDefiningRecipe()->getParent(), + std::next(NewExitingVPV->getDefiningRecipe()->getIterator())); + Trunc = + Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy); + Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy); + } if (PhiR->getOperand(1) == NewExitingVPV) PhiR->setOperand(1, Extnd->getVPSingleValue()); @@ -9492,8 +9326,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { if (ResumePhiIter == MainScalarPH->phis().end()) { VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); ResumePhi = ScalarPHBuilder.createScalarPhi( - {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, - "vec.epilog.resume.val"); + {VectorTC, + MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()}, + {}, "vec.epilog.resume.val"); } else { ResumePhi = cast<VPPhi>(&*ResumePhiIter); if (MainScalarPH->begin() == MainScalarPH->end()) @@ -9520,55 +9355,57 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop( VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); + VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV(); + // When vectorizing the epilogue loop, the canonical induction needs to be + // adjusted by the value after the main vector loop. Find the resume value + // created during execution of the main VPlan. It must be the first phi in the + // loop preheader. Use the value to increment the canonical IV, and update all + // users in the loop region to use the adjusted value. + // FIXME: Improve modeling for canonical IV start values in the epilogue + // loop. + using namespace llvm::PatternMatch; + PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); + for (Value *Inc : EPResumeVal->incoming_values()) { + if (match(Inc, m_SpecificInt(0))) + continue; + assert(!EPI.VectorTripCount && + "Must only have a single non-zero incoming value"); + EPI.VectorTripCount = Inc; + } + // If we didn't find a non-zero vector trip count, all incoming values + // must be zero, which also means the vector trip count is zero. Pick the + // first zero as vector trip count. + // TODO: We should not choose VF * UF so the main vector loop is known to + // be dead. + if (!EPI.VectorTripCount) { + assert(EPResumeVal->getNumIncomingValues() > 0 && + all_of(EPResumeVal->incoming_values(), + [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) && + "all incoming values must be 0"); + EPI.VectorTripCount = EPResumeVal->getOperand(0); + } + VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); + assert(all_of(IV->users(), + [](const VPUser *U) { + return isa<VPScalarIVStepsRecipe>(U) || + isa<VPDerivedIVRecipe>(U) || + cast<VPRecipeBase>(U)->isScalarCast() || + cast<VPInstruction>(U)->getOpcode() == + Instruction::Add; + }) && + "the canonical IV should only be used by its increment or " + "ScalarIVSteps when resetting the start value"); + VPBuilder Builder(Header, Header->getFirstNonPhi()); + VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV}); + IV->replaceAllUsesWith(Add); + Add->setOperand(0, IV); + DenseMap<Value *, Value *> ToFrozen; SmallVector<Instruction *> InstsToMove; // Ensure that the start values for all header phi recipes are updated before - // vectorizing the epilogue loop. - for (VPRecipeBase &R : Header->phis()) { - if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { - // When vectorizing the epilogue loop, the canonical induction start - // value needs to be changed from zero to the value after the main - // vector loop. Find the resume value created during execution of the main - // VPlan. It must be the first phi in the loop preheader. - // FIXME: Improve modeling for canonical IV start values in the epilogue - // loop. - using namespace llvm::PatternMatch; - PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); - for (Value *Inc : EPResumeVal->incoming_values()) { - if (match(Inc, m_SpecificInt(0))) - continue; - assert(!EPI.VectorTripCount && - "Must only have a single non-zero incoming value"); - EPI.VectorTripCount = Inc; - } - // If we didn't find a non-zero vector trip count, all incoming values - // must be zero, which also means the vector trip count is zero. Pick the - // first zero as vector trip count. - // TODO: We should not choose VF * UF so the main vector loop is known to - // be dead. - if (!EPI.VectorTripCount) { - assert( - EPResumeVal->getNumIncomingValues() > 0 && - all_of(EPResumeVal->incoming_values(), - [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) && - "all incoming values must be 0"); - EPI.VectorTripCount = EPResumeVal->getOperand(0); - } - VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); - assert(all_of(IV->users(), - [](const VPUser *U) { - return isa<VPScalarIVStepsRecipe>(U) || - isa<VPDerivedIVRecipe>(U) || - cast<VPRecipeBase>(U)->isScalarCast() || - cast<VPInstruction>(U)->getOpcode() == - Instruction::Add; - }) && - "the canonical IV should only be used by its increment or " - "ScalarIVSteps when resetting the start value"); - IV->setOperand(0, VPV); - continue; - } - + // vectorizing the epilogue loop. Skip the canonical IV, which has been + // handled above. + for (VPRecipeBase &R : drop_begin(Header->phis())) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { @@ -10053,7 +9890,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, |
