diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Transforms/Vectorize | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
20 files changed, 2304 insertions, 1033 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 491f0b76f4ae..53129e2e5fbb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -170,10 +170,10 @@ private: bool recognizeFindFirstByte(); Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, - unsigned VF, Type *CharTy, BasicBlock *ExitSucc, - BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, - Value *NeedleEnd); + unsigned VF, Type *CharTy, Value *IndPhi, + BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd); void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, BasicBlock *ExitFail, @@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) { return false; } +static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes, + BasicBlock *SuccBB, BasicBlock *IncBB) { + for (PHINode &PN : SuccBB->phis()) { + // Look through the incoming values to find ScalarRes, meaning this is a + // PHI collecting the results of the transformation. + bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == ScalarRes) { + ResPhi = true; + break; + } + + // Any PHI that depended upon the result of the transformation needs a new + // incoming value from IncBB. + if (ResPhi) + PN.addIncoming(VectorRes, IncBB); + else { + // There should be no other outside uses of other values in the + // original loop. Any incoming values should either: + // 1. Be for blocks outside the loop, which aren't interesting. Or .. + // 2. These are from blocks in the loop with values defined outside + // the loop. We should a similar incoming value from CmpBB. + for (BasicBlock *BB : PN.blocks()) + if (L->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB); + break; + } + } + } +} + bool LoopIdiomVectorize::recognizeByteCompare() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed @@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch( Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()}, {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load"); - StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE); - auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr); - Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS); - Value *VectorMatchCmp = Builder.CreateIntrinsic( - Intrinsic::vp_icmp, {VectorLhsLoad->getType()}, - {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr, - "mismatch.cmp"); + Value *VectorMatchCmp = + Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp"); Value *CTZ = Builder.CreateIntrinsic( Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()}, {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask, @@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); } - auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { - for (PHINode &PN : SuccBB->phis()) { - // At this point we've already replaced all uses of the result from the - // loop with ByteCmp. Look through the incoming values to find ByteCmp, - // meaning this is a Phi collecting the results of the byte compare. - bool ResPhi = false; - for (Value *Op : PN.incoming_values()) - if (Op == ByteCmpRes) { - ResPhi = true; - break; - } - - // Any PHI that depended upon the result of the byte compare needs a new - // incoming value from CmpBB. This is because the original loop will get - // deleted. - if (ResPhi) - PN.addIncoming(ByteCmpRes, CmpBB); - else { - // There should be no other outside uses of other values in the - // original loop. Any incoming values should either: - // 1. Be for blocks outside the loop, which aren't interesting. Or .. - // 2. These are from blocks in the loop with values defined outside - // the loop. We should a similar incoming value from CmpBB. - for (BasicBlock *BB : PN.blocks()) - if (CurLoop->contains(BB)) { - PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); - break; - } - } - } - }; - // Ensure all Phis in the successors of CmpBB have an incoming value from it. - fixSuccessorPhis(EndBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB); if (EndBB != FoundBB) - fixSuccessorPhis(FoundBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB); // The new CmpBB block isn't part of the loop, but will need to be added to // the outer loop if there is one. @@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { Value *LoopIdiomVectorize::expandFindFirstByte( IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, - BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) { + Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, Value *NeedleStart, + Value *NeedleEnd) { // Set up some types and constants that we intend to reuse. auto *PtrTy = Builder.getPtrTy(); auto *I64Ty = Builder.getInt64Ty(); @@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte( MatchLCSSA->addIncoming(Search, BB2); MatchPredLCSSA->addIncoming(MatchPred, BB2); + // Ensure all Phis in the successors of BB3/BB5 have an incoming value from + // them. + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3); + if (ExitSucc != ExitFail) + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5); + if (VerifyLoops) { OuterLoop->verifyLoop(); InnerLoop->verifyLoop(); @@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte( DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); - Value *MatchVal = - expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, - SearchStart, SearchEnd, NeedleStart, NeedleEnd); + expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail, + SearchStart, SearchEnd, NeedleStart, NeedleEnd); assert(PHBranch->isUnconditional() && "Expected preheader to terminate with an unconditional branch."); - // Add new incoming values with the result of the transformation to PHINodes - // of ExitSucc that use IndPhi. - for (auto *U : llvm::make_early_inc_range(IndPhi->users())) { - auto *PN = dyn_cast<PHINode>(U); - if (PN && PN->getParent() == ExitSucc) - PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent()); - } - if (VerifyLoops && CurLoop->getParentLoop()) { CurLoop->getParentLoop()->verifyLoop(); if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 789047a2a28e..2704e66f3a70 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -15,8 +15,10 @@ // #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() { }); } - if (!LAI->canVectorizeMemory()) + if (!LAI->canVectorizeMemory()) { + if (hasUncountableExitWithSideEffects()) { + reportVectorizationFailure( + "Cannot vectorize unsafe dependencies in uncountable exit loop with " + "side effects", + "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, + TheLoop); + return false; + } + return canVectorizeIndirectUnsafeDependences(); + } if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("We don't allow storing to uniform addresses", @@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isGuaranteedNotToBePoison(CurrV, AC, TheLoop->getLoopPredecessor() ->getTerminator() - ->getIterator())) + ->getIterator(), + DT)) return false; continue; } @@ -1754,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } }; + bool HasSideEffects = false; for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (I.mayWriteToMemory()) { - // We don't support writes to memory. + if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) { + HasSideEffects = true; + continue; + } + + // We don't support complex writes to memory. reportVectorizationFailure( - "Writes to memory unsupported in early exit loops", - "Cannot vectorize early exit loop with writes to memory", + "Complex writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with complex writes to memory", "WritesInEarlyExitLoop", ORE, TheLoop); return false; - } else if (!IsSafeOperation(&I)) { + } + + if (!IsSafeOperation(&I)) { reportVectorizationFailure("Early exit loop contains operations that " "cannot be speculatively executed", "UnsafeOperationsEarlyExitLoop", ORE, @@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock && "Expected latch predecessor to be the early exiting block"); + SmallVector<LoadInst *, 4> NonDerefLoads; // TODO: Handle loops that may fault. - Predicates.clear(); - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, - &Predicates)) { - reportVectorizationFailure( - "Loop may fault", - "Cannot vectorize potentially faulting early exit loop", - "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + if (!HasSideEffects) { + // Read-only loop. + Predicates.clear(); + if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", "Cannot vectorize non-read-only early exit loop", + "NonReadOnlyEarlyExitLoop", ORE, TheLoop); + return false; + } + } else if (!canUncountableExitConditionLoadBeMoved( + SingleUncountableExitingBlock)) return false; + + // Check non-dereferenceable loads if any. + for (LoadInst *LI : NonDerefLoads) { + // Only support unit-stride access for now. + int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand()); + if (Stride != 1) { + reportVectorizationFailure( + "Loop contains potentially faulting strided load", + "Cannot vectorize early exit loop with " + "strided fault-only-first load", + "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop); + return false; + } + PotentiallyFaultingLoads.insert(LI); + LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI + << "\n"); } [[maybe_unused]] const SCEV *SymbolicMaxBTC = @@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { "backedge taken count: " << *SymbolicMaxBTC << '\n'); UncountableExitingBB = SingleUncountableExitingBlock; + UncountableExitWithSideEffects = HasSideEffects; + return true; +} + +bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved( + BasicBlock *ExitingBlock) { + // Try to find a load in the critical path for the uncountable exit condition. + // This is currently matching about the simplest form we can, expecting + // only one in-loop load, the result of which is directly compared against + // a loop-invariant value. + // FIXME: We're insisting on a single use for now, because otherwise we will + // need to make PHI nodes for other users. That can be done once the initial + // transform code lands. + auto *Br = cast<BranchInst>(ExitingBlock->getTerminator()); + + using namespace llvm::PatternMatch; + Instruction *L = nullptr; + Value *Ptr = nullptr; + Value *R = nullptr; + if (!match(Br->getCondition(), + m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))), + m_Value(R))))) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // FIXME: Don't rely on operand ordering for the comparison. + if (!TheLoop->isLoopInvariant(R)) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // Make sure that the load address is not loop invariant; we want an + // address calculation that we can rotate to the next vector iteration. + const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); + if (!isa<SCEVAddRecExpr>(PtrScev)) { + reportVectorizationFailure( + "Uncountable exit condition depends on load with an address that is " + "not an add recurrence", + "EarlyExitLoadInvariantAddress", ORE, TheLoop); + return false; + } + + // FIXME: Support gathers after first-faulting load support lands. + SmallVector<const SCEVPredicate *, 4> Predicates; + LoadInst *Load = cast<LoadInst>(L); + if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", + "Cannot vectorize potentially faulting early exit loop", + "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + return false; + } + + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(TheLoop); + // We need to know that load will be executed before we can hoist a + // copy out to run just before the first iteration. + // FIXME: Currently, other restrictions prevent us from reaching this point + // with a loop where the uncountable exit condition is determined + // by a conditional load. + assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) && + "Unhandled control flow in uncountable exit loop with side effects"); + + // Prohibit any potential aliasing with any instruction in the loop which + // might store to memory. + // FIXME: Relax this constraint where possible. + for (auto *BB : TheLoop->blocks()) { + for (auto &I : *BB) { + if (&I == Load) + continue; + + if (I.mayWriteToMemory()) { + if (auto *SI = dyn_cast<StoreInst>(&I)) { + AliasResult AR = AA->alias(Ptr, SI->getPointerOperand()); + if (AR == AliasResult::NoAlias) + continue; + } + + reportVectorizationFailure( + "Cannot determine whether critical uncountable exit load address " + "does not alias with a memory write", + "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop); + return false; + } + } + } + return true; } @@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } else { if (!isVectorizableEarlyExitLoop()) { assert(!hasUncountableEarlyExit() && + !hasUncountableExitWithSideEffects() && "Must be false without vectorizable early-exit loop"); if (DoExtraAnalysis) Result = false; @@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } + // Bail out for state-changing loops with uncountable exits for now. + if (UncountableExitWithSideEffects) { + reportVectorizationFailure( + "Writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with writes to memory", + "WritesInEarlyExitLoop", ORE, TheLoop); + return false; + } + if (Result) { LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 838476dcae66..d34d2ae7a0b3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -334,6 +334,10 @@ public: FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL)); } + VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) { + return tryInsertInstruction(new VPExpandSCEVRecipe(Expr)); + } + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// @@ -559,6 +563,20 @@ public: /// Emit remarks for recipes with invalid costs in the available VPlans. void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); + /// Create a check to \p Plan to see if the vector loop should be executed + /// based on its trip count. + void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const; + + /// Update loop metadata and profile info for both the scalar remainder loop + /// and \p VectorLoop, if it exists. Keeps all loop hints from the original + /// loop on the vector loop and replaces vectorizer-specific metadata. + void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, + VPBasicBlock *HeaderVPBB, + bool VectorizingEpilogue, + unsigned EstimatedVFxUF, + bool DisableRuntimeUnroll); + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -613,13 +631,15 @@ private: /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B. bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B, bool HasTail) const; + const VectorizationFactor &B, bool HasTail, + bool IsEpilogue = false) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B in the context of vectorizing a loop with known \p MaxTripCount. bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - const unsigned MaxTripCount, bool HasTail) const; + const unsigned MaxTripCount, bool HasTail, + bool IsEpilogue = false) const; /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a0f306c12754..3cff43a51029 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -165,15 +165,6 @@ using namespace SCEVPatternMatch; const char VerboseDebug[] = DEBUG_TYPE "-verbose"; #endif -/// @{ -/// Metadata attribute names -const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; -const char LLVMLoopVectorizeFollowupVectorized[] = - "llvm.loop.vectorize.followup_vectorized"; -const char LLVMLoopVectorizeFollowupEpilogue[] = - "llvm.loop.vectorize.followup_epilogue"; -/// @} - STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); @@ -500,26 +491,22 @@ public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, - ElementCount VecWidth, - ElementCount MinProfitableTripCount, - unsigned UnrollFactor, LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &RTChecks, VPlan &Plan) + ElementCount VecWidth, unsigned UnrollFactor, + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, + VPlan &Plan) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC), - VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), - UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM), - BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), + Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), VectorPHVPBB(cast<VPBasicBlock>( Plan.getVectorLoopRegion()->getSinglePredecessor())) {} virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop that will contain vectorized instructions later - /// on, while the old loop will be used as the scalar remainder. Control flow - /// is generated around the vectorized (and scalar epilogue) loops consisting - /// of various checks and bypasses. Return the pre-header block of the new - /// loop. In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. + /// Creates a basic block for the scalar preheader. Both + /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite + /// the method to create additional blocks and checks needed for epilogue + /// vectorization. virtual BasicBlock *createVectorizedLoopSkeleton(); /// Fix the vectorized code, taking care of header phi's, and more. @@ -536,38 +523,18 @@ public: /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } - /// Return the additional bypass block which targets the scalar loop by - /// skipping the epilogue loop after completing the main loop. - BasicBlock *getAdditionalBypassBlock() const { - assert(AdditionalBypassBlock && - "Trying to access AdditionalBypassBlock but it has not been set"); - return AdditionalBypassBlock; - } - protected: friend class LoopVectorizationPlanner; - // Create a check to see if the vector loop should be executed - Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; - - /// Emit a bypass check to see if the vector trip count is zero, including if - /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); - - /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. - void createVectorLoopSkeleton(StringRef Prefix); + /// Create and return a new IR basic block for the scalar preheader whose name + /// is prefixed with \p Prefix. + BasicBlock *createScalarPreheader(StringRef Prefix); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} - /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the - /// vector preheader and its predecessor, also connecting the new block to the - /// scalar preheader. - void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); - /// The original loop. Loop *OrigLoop; @@ -592,8 +559,6 @@ protected: /// vector elements. ElementCount VF; - ElementCount MinProfitableTripCount; - /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -603,18 +568,9 @@ protected: // --- Vectorization state --- - /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader = nullptr; - - /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader = nullptr; - /// Trip count of the original loop. Value *TripCount = nullptr; - /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) - Value *VectorTripCount = nullptr; - /// The profitablity analysis. LoopVectorizationCostModel *Cost; @@ -626,11 +582,6 @@ protected: /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; - /// The additional bypass block which conditionally skips over the epilogue - /// loop after executing the main loop. Needed to resume inductions and - /// reductions during epilogue vectorization. - BasicBlock *AdditionalBypassBlock = nullptr; - VPlan &Plan; /// The vector preheader block of \p Plan, used as target for check blocks @@ -679,20 +630,8 @@ public: GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth, - MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, - Checks, Plan), - EPI(EPI) {} - - // Override this function to handle the more complex control flow around the - // three loops. - BasicBlock *createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); - } - - /// The interface for creating a vectorized skeleton using one of two - /// different strategies, each corresponding to one execution of the vplan - /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + UnrollFactor, CM, BFI, PSI, Checks, Plan), + EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {} /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -701,6 +640,9 @@ public: /// iteration count of the loop is so small that the main vector loop is /// completely skipped. EpilogueLoopVectorizationInfo &EPI; + +protected: + ElementCount MinProfitableTripCount; }; /// A specialized derived class of inner loop vectorizer that performs @@ -720,14 +662,24 @@ public: BFI, PSI, Check, Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the - /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *main loop* strategy (i.e., the first pass of VPlan execution). + BasicBlock *createVectorizedLoopSkeleton() final; protected: + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. + void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + + // Create a check to see if the main vector loop should be executed + Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, + unsigned UF) const; + /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). - BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, + bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -736,6 +688,11 @@ protected: // vectorization of *epilogue* loops in the process of vectorizing loops and // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { + /// The additional bypass block which conditionally skips over the epilogue + /// loop after executing the main loop. Needed to resume inductions and + /// reductions during epilogue vectorization. + BasicBlock *AdditionalBypassBlock = nullptr; + public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -749,14 +706,22 @@ public: TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the - /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *epilogue loop* strategy (i.e., the second pass of VPlan execution). + BasicBlock *createVectorizedLoopSkeleton() final; + + /// Return the additional bypass block which targets the scalar loop by + /// skipping the epilogue loop after completing the main loop. + BasicBlock *getAdditionalBypassBlock() const { + assert(AdditionalBypassBlock && + "Trying to access AdditionalBypassBlock but it has not been set"); + return AdditionalBypassBlock; + } protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck( + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -962,8 +927,8 @@ public: /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); - /// \return True if register pressure should be calculated for the given VF. - bool shouldCalculateRegPressureForVF(ElementCount VF); + /// \return True if register pressure should be considered for the given VF. + bool shouldConsiderRegPressureForVF(ElementCount VF); /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as @@ -1159,7 +1124,10 @@ public: CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const { assert(!VF.isScalar() && "Expected vector VF"); - return CallWideningDecisions.at({CI, VF}); + auto I = CallWideningDecisions.find({CI, VF}); + if (I == CallWideningDecisions.end()) + return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0}; + return I->second; } /// Return True if instruction \p I is an optimizable truncate whose operand @@ -1682,7 +1650,9 @@ private: Instruction *I = dyn_cast<Instruction>(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I) || - getWideningDecision(I, VF) == CM_Scalarize) + getWideningDecision(I, VF) == CM_Scalarize || + (isa<CallInst>(I) && + getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1878,6 +1848,8 @@ public: "claimed checks are required"); } + SCEVExp.eraseDeadInstructions(SCEVCheckCond); + if (!MemCheckBlock && !SCEVCheckBlock) return; @@ -2030,7 +2002,7 @@ public: /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR /// outside VPlan. - std::pair<Value *, BasicBlock *> getSCEVChecks() { + std::pair<Value *, BasicBlock *> getSCEVChecks() const { using namespace llvm::PatternMatch; if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2040,7 +2012,7 @@ public: /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR /// outside VPlan. - std::pair<Value *, BasicBlock *> getMemRuntimeChecks() { + std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const { using namespace llvm::PatternMatch; if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2049,9 +2021,7 @@ public: /// Return true if any runtime checks have been added bool hasChecks() const { - using namespace llvm::PatternMatch; - return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) || - MemRuntimeCheckCond; + return getSCEVChecks().first || getMemRuntimeChecks().first; } }; } // namespace @@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { +void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan( + BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. VPBlockBase *ScalarPH = Plan.getScalarPreheader(); @@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { } } -Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, - unsigned UF) const { +Value *EpilogueVectorizerMainLoop::createIterationCountCheck( + BasicBlock *VectorPH, ElementCount VF, unsigned UF) const { // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one @@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; IRBuilder<InstSimplifyFolder> Builder( TCCheckBlock->getContext(), InstSimplifyFolder(TCCheckBlock->getDataLayout())); @@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, return CheckMinIters; } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - Value *CheckMinIters = createIterationCountCheck(VF, UF); - // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); - - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); - ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - - assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() == - TCCheckBlock && - "Plan's entry must be TCCCheckBlock"); -} - /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All @@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, return IRVPBB; } -void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { - LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - assert(LoopVectorPreHeader && "Invalid loop structure"); +BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) { + BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); + assert(VectorPH && "Invalid loop structure"); assert((OrigLoop->getUniqueLatchExitBlock() || Cost->requiresScalarEpilogue(VF.isVector())) && "loops not exiting via the latch without required epilogue?"); - LoopScalarPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, Twine(Prefix) + "scalar.ph"); // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock - // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar - // preheader may be unreachable at this point. Instead it is replaced in - // createVectorizedLoopSkeleton. + // wrapping the newly created scalar preheader here at the moment, because the + // Plan's scalar preheader may be unreachable at this point. Instead it is + // replaced in executePlan. + return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr, + Twine(Prefix) + "scalar.ph"); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore( } BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's - / | preheader are expanded here. Eventually all required SCEV - / | expansion should happen here. - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop (created during VPlan execution). - | | - | v - \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to - | | successors created during VPlan execution) - \/ | - /\ v - | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). - | | - (opt) v <-- edge from middle to exit iff epilogue is not required. - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header - | | wrapped in VPIRBasicBlock). - \ | - \ v - >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) - ... - */ - - // Create an empty vector loop, and prepare basic blocks for the runtime - // checks. - createVectorLoopSkeleton(""); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); - - replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); - return LoopVectorPreHeader; + // Create a new IR basic block for the scalar preheader. + BasicBlock *ScalarPH = createScalarPreheader(""); + return ScalarPH->getSinglePredecessor(); } namespace { @@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Remove redundant induction instructions. cse(HeaderBB); - - // Set/update profile weights for the vector and remainder loops as original - // loop iterations are now distributed among them. Note that original loop - // becomes the scalar remainder loop after vectorization. - // - // For cases like foldTailByMasking() and requiresScalarEpiloque() we may - // end up getting slightly roughened result but that should be OK since - // profile is not inherently precise anyway. Note also possible bypass of - // vector code caused by legality checks is ignored, assigning all the weight - // to the vector loop, optimistically. - // - // For scalable vectorization we can't know at compile time how many - // iterations of the loop are handled in one vector iteration, so instead - // use the value of vscale used for tuning. - Loop *VectorLoop = LI->getLoopFor(HeaderBB); - unsigned EstimatedVFxUF = - estimateElementCount(VF * UF, Cost->getVScaleForTuning()); - setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { @@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, toVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); - // Certain instructions can be cheaper to vectorize if they have a constant - // second vector operand. One example of this are shifts on x86. - Value *Op2 = I->getOperand(1); - auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && - Legal->isInvariant(Op2)) - Op2Info.Kind = TargetTransformInfo::OK_UniformValue; - SmallVector<const Value *, 4> Operands(I->operand_values()); SafeDivisorCost += TTI.getArithmeticInstrCost( - I->getOpcode(), VecTy, CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - Op2Info, Operands, I); + I->getOpcode(), VecTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Operands, I); return {ScalarizationCost, SafeDivisorCost}; } @@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } -bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( +bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF( ElementCount VF) { if (!useMaxBandwidth(VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector @@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, const unsigned MaxTripCount, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; @@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) && A.Width.isScalable() && !B.Width.isScalable(); auto CmpFn = [PreferScalable](const InstructionCost &LHS, @@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, - HasTail); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail, + IsEpilogue); } void LoopVectorizationPlanner::emitInvalidCostRemarks( @@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenIntOrFpInductionSC: case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: + case VPDef::VPInterleaveEVLSC: case VPDef::VPInterleaveSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: @@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // If no def nor is a store, e.g., branches, continue - no value to check. if (R.getNumDefinedValues() == 0 && - !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( - &R)) + !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R)) continue; // For multi-def recipes, currently only interleaved loads, suffice to // check first def only. @@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { /// If the register pressure needs to be considered for VF, /// don't consider the VF as valid if it exceeds the number /// of registers for the target. - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; @@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (!VPI) continue; switch (VPI->getOpcode()) { - case VPInstruction::ActiveLaneMask: + // Selects are only modelled in the legacy cost model for safe + // divisors. + case Instruction::Select: { + VPValue *VPV = VPI->getVPSingleValue(); + if (VPV->getNumUsers() == 1) { + if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; + } + } + } + C += VPI->cost(VF, CostCtx); + break; + } + case VPInstruction::ActiveLaneMask: { + unsigned Multiplier = + cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) + ->getZExtValue(); + C += VPI->cost(VF * Multiplier, CostCtx); + break; + } case VPInstruction::ExplicitVectorLength: C += VPI->cost(VF, CostCtx); break; @@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( } if (Result.Width.isScalar() || - isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking())) + isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(), + /*IsEpilogue*/ true)) Result = NextVF; } @@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); - const Value *Ptr = getLoadStorePointerOperand(I); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); + Value *Ptr = getLoadStorePointerOperand(I); + Type *PtrTy = Ptr->getType(); + + if (!Legal->isUniform(Ptr, VF)) + PtrTy = toVectorTy(PtrTy, VF); return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, @@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI::CastContextHint::None, CostKind, RedOp); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) @@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); InstructionCost ExtraExtCost = 0; if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; @@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); + true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy, + CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) return I == RetI ? RedCost : 0; @@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert(Op0->getType()->getScalarSizeInBits() == 1 && Op1->getType()->getScalarSizeInBits() == 1); - SmallVector<const Value *, 2> Operands{Op0, Op1}; return TTI.getArithmeticInstrCost( - match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, - CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); + match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, + VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I); } Type *CondTy = SI->getCondition()->getType(); @@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { })) continue; VecValuesToIgnore.insert(Op); - DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadInterleavePointerOps, Op->operands()); } for (const auto &[_, Ops] : DeadInvariantStoreOps) @@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { ValuesToIgnore.insert(Op); VecValuesToIgnore.insert(Op); - DeadOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadOps, Op->operands()); } // Ignore type-promoting instructions we identified during reduction @@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) const { - if (ForceTargetInstructionCost.getNumOccurrences()) - return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); - return CM.getInstructionCost(UI, VF); + InstructionCost Cost = CM.getInstructionCost(UI, VF); + if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences()) + return InstructionCost(ForceTargetInstructionCost); + return Cost; } bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I, @@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " << VF << " because it uses too many registers\n"); @@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { return BestFactor; } -static void addRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().starts_with("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(I)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { using namespace VPlanPatternMatch; assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult && @@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { // epilog loop, fix the reduction's scalar PHI node by adding the incoming value // from the main vector loop. static void fixReductionScalarResumeWhenVectorizingEpilog( - VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) { + VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) { // Get the VPInstruction computing the reduction result in the middle block. // The first operand may not be from the middle block if it is not connected // to the scalar preheader. In that case, there's nothing to fix. @@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( // When fixing reductions in the epilogue loop we should already have // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry // over the incoming values correctly. - auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true)); - EpiResumePhi->setIncomingValueForBlock( + EpiResumePhi.setIncomingValueForBlock( BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } @@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BestVPlan, BestVF, VScale); } - if (!VectorizingEpilogue) { - // Checks are the same for all VPlans, added to BestVPlan only for - // compactness. - attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); - } + // Checks are the same for all VPlans, added to BestVPlan only for + // compactness. + attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); // Retrieving VectorPH now when it's easier while VPlan still has Regions. VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); @@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); + VPlanTransforms::cse(BestVPlan); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - BasicBlock *EntryBB = - cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock(); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(), State.CFG.PrevBB->getSingleSuccessor()); @@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // looked through single-entry phis. ScalarEvolution &SE = *PSE.getSE(); for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { - if (Exit->getNumPredecessors() == 0) + if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) SE.forgetLcssaPhiWithNewPredecessor( @@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // //===------------------------------------------------===// - // Move check blocks to their final position. - // TODO: Move as part of VPIRBB execute and update impacted tests. - if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second) - MemCheckBlock->moveAfter(EntryBB); - if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second) - SCEVCheckBlock->moveAfter(EntryBB); - BestVPlan.execute(&State); - // 2.5 When vectorizing the epilogue, fix reduction resume values from the - // additional bypass block. - if (VectorizingEpilogue) { - assert(!BestVPlan.hasEarlyExit() && - "Epilogue vectorisation not yet supported with early exits"); - BasicBlock *PH = OrigLoop->getLoopPreheader(); - BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (auto *Pred : predecessors(PH)) { - for (PHINode &Phi : PH->phis()) { - if (Phi.getBasicBlockIndex(Pred) != -1) - continue; - Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); - } - } - VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader(); - if (ScalarPH->getNumPredecessors() > 0) { - // If ScalarPH has predecessors, we may need to update its reduction - // resume values. - for (VPRecipeBase &R : ScalarPH->phis()) { - fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State, - BypassBlock); - } - } - } - // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT); - if (HeaderVPBB) { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - std::optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) { - L->setLoopID(*VectorizedLoopID); - } else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); - - // Check if it's EVL-vectorized and mark the corresponding metadata. - bool IsEVLVectorized = - llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { - // Looking for the ExplictVectorLength VPInstruction. - if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) - return VI->getOpcode() == VPInstruction::ExplicitVectorLength; - return false; - }); - if (IsEVLVectorized) { - LLVMContext &Context = L->getHeader()->getContext(); - MDNode *LoopID = L->getLoopID(); - auto *IsEVLVectorizedMD = MDNode::get( - Context, - {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), - MDString::get(Context, "evl")}); - MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, - {IsEVLVectorizedMD}); - L->setLoopID(NewLoopID); - } - } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); - } + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar(); + updateLoopMetadataAndProfileInfo( + HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB)) + : nullptr, + HeaderVPBB, VectorizingEpilogue, + estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()), + DisableRuntimeUnroll); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton(""); +BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader(""); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, true); + emitIterationCountCheck(VectorPH, ScalarPH, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); + VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator()) + ->getSuccessor(1); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case // that goes directly through the vector epilogue. The longer-path length for @@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, false); + emitIterationCountCheck(VectorPH, ScalarPH, false); - return LoopVectorPreHeader; + return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator()) + ->getSuccessor(1); } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock * -EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, - bool ForEpilogue) { +BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck( + BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = - createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, - ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); + Value *CheckMinIters = createIterationCountCheck( + VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); + VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + static_cast<DominatorTree *>(nullptr), LI, nullptr, + "vector.ph"); if (ForEpilogue) { // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. EPI.TripCount = Count; } else { - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); } - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); @@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton("vec.epilog."); - +BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader("vec.epilog."); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. - LoopVectorPreHeader->setName("vec.epilog.ph"); + VectorPH->setName("vec.epilog.ph"); BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, - nullptr, "vec.epilog.iter.check", true); - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr, + "vec.epilog.iter.check", true); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); - emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH, VecEpilogueIterationCountCheck); AdditionalBypassBlock = VecEpilogueIterationCountCheck; @@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && "expected this to be saved from the previous pass."); EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopVectorPreHeader); + VecEpilogueIterationCountCheck, VectorPH); EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); // Adjust the terminators of runtime check blocks and phis using them. BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second; BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second; if (SCEVCheckBlock) SCEVCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); if (MemCheckBlock) MemCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); - DT->changeImmediateDominator(LoopScalarPreHeader, - EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck); // The vec.epilog.iter.check block may contain Phi nodes from inductions or // reductions which merge control-flow from the latch block and the middle @@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis())); for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); + Phi->moveBefore(VectorPH->getFirstNonPHIIt()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); @@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { Phi->removeIncomingValue(MemCheckBlock); } - return LoopVectorPreHeader; + return VectorPH; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been saved in the first pass."); @@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - auto VScale = Cost->getVScaleForTuning(); - unsigned MainLoopStep = - estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); - unsigned EpilogueLoopStep = - estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); - // We assume the remaining `Count` is equally distributed in - // [0, MainLoopStep) - // So the probability for `Count < EpilogueLoopStep` should be - // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep - unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); - const uint32_t Weights[] = {EstimatedSkipCount, - MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); - } + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); + auto VScale = Cost->getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); + unsigned EpilogueLoopStep = + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + // TODO: Improve the estimate by taking the estimated trip count into + // consideration. + unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); + const uint32_t Weights[] = {EstimatedSkipCount, + MainLoopStep - EstimatedSkipCount}; + setBranchWeights(BI, Weights, /*IsExpected=*/false); ReplaceInstWithInst(Insert->getTerminator(), &BI); // A new entry block has been created for the epilogue VPlan. Hook it in, as @@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), - Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit()); VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, CM.foldTailByMasking()); @@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); - if (!Recipe) { - SmallVector<VPValue *, 4> Operands(R.operands()); - Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range); - } + if (!Recipe) + Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) { @@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // to remove the need to keep a map of masks beyond the predication // transform. RecipeBuilder.updateBlockMaskCache(Old2New); - for (const auto &[Old, _] : Old2New) + for (VPValue *Old : Old2New.keys()) Old->getDefiningRecipe()->eraseFromParent(); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && @@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); - // Replace VPValues for known constant strides guaranteed by predicate scalar - // evolution. - auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { - auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || - R->getParent() == - Plan->getVectorLoopRegion()->getSinglePredecessor(); - }; - for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { - auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); - auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); - // Only handle constant strides for now. - if (!ScevStride) - continue; - - auto *CI = Plan->getOrAddLiveIn( - ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); - if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - - // The versioned value may not be used in the loop directly but through a - // sext/zext. Add new live-ins in those cases. - for (Value *U : StrideV->users()) { - if (!isa<SExtInst, ZExtInst>(U)) - continue; - VPValue *StrideVPV = Plan->getLiveIn(U); - if (!StrideVPV) - continue; - unsigned BW = U->getType()->getScalarSizeInBits(); - APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) - : ScevStride->getAPInt().zext(BW); - VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - } - } + // Replace VPValues for known constant strides. + VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE, + Legal->getLAI()->getSymbolicStrides()); auto BlockNeedsPredication = [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); @@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { OrigLoop, *LI, Legal->getWidestInductionType(), getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); VPlanTransforms::handleEarlyExits(*Plan, - /*HasUncountableExit*/ false, Range); + /*HasUncountableExit*/ false); VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, /*TailFolded*/ false); @@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( void LoopVectorizationPlanner::attachRuntimeChecks( VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const { const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks(); - if (SCEVCheckBlock) { + if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) { assert((!CM.OptForSize || CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); @@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( HasBranchWeights); } const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks(); - if (MemCheckBlock) { + if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { // VPlan-native path does not do any analysis for runtime checks // currently. assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && @@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks( } } +void LoopVectorizationPlanner::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + bool IsIndvarOverflowCheckNeededForVF = + VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() && + !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) && + CM.getTailFoldingStyle() != + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + const uint32_t *BranchWeigths = + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()) + ? &MinItersBypassWeights[0] + : nullptr; + VPlanTransforms::addMinimumIterationCheck( + Plan, VF, UF, MinProfitableTripCount, + CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(), + IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths, + OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), + *PSE.getSE()); +} + void VPDerivedIVRecipe::execute(VPTransformState &State) { assert(!State.Lane && "VPDerivedIVRecipe being replicated."); @@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath( { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM, + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1, + VF.MinProfitableTripCount); + + LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false); } reportVectorization(ORE, L, VF, 1); - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues( return EndValueFromAdditionalBypass; } +static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, + VPlan &BestEpiPlan, + LoopVectorizationLegality &LVL, + const SCEV2ValueTy &ExpandedSCEVs, + Value *MainVectorTripCount) { + // Fix reduction resume values from the additional bypass block. + BasicBlock *PH = L->getLoopPreheader(); + for (auto *Pred : predecessors(PH)) { + for (PHINode &Phi : PH->phis()) { + if (Phi.getBasicBlockIndex(Pred) != -1) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); + } + } + auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader()); + if (ScalarPH->hasPredecessors()) { + // If ScalarPH has predecessors, we may need to update its reduction + // resume values. + for (const auto &[R, IRPhi] : + zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) { + fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi, + BypassBlock); + } + } + + // Fix induction resume values from the additional bypass block. + IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt()); + for (const auto &[IVPhi, II] : LVL.getInductionVars()) { + auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); + Value *V = createInductionAdditionalBypassValues( + IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount, + LVL.getPrimaryInduction()); + // TODO: Directly add as extra operand to the VPResumePHI recipe. + Inc->setIncomingValueForBlock(BypassBlock, V); + } +} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); @@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, BFI, PSI, AA); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + if (!LVL.getPotentiallyFaultingLoads().empty()) { + reportVectorizationFailure("Auto-vectorization of loops with potentially " + "faulting load is not supported", + "PotentiallyFaultingLoadsNotSupported", ORE, L); + return false; + } + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify @@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - bool DisableRuntimeUnroll = false; - MDNode *OrigLoopID = L->getLoopID(); - { + // Report the vectorization decision. + if (VF.Width.isScalar()) { using namespace ore; - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - InnerLoopVectorizer Unroller( - L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1), - ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); - - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. - VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + assert(IC > 1); + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); + } else { + // Report the vectorization decision. + reportVectorization(ORE, L, VF, IC); + } + if (ORE->allowExtraAnalysis(LV_NAME)) + checkMixedPrecision(L, ORE); - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - LVP.selectEpilogueVectorizationFactor(VF.Width, IC); - if (EpilogueVF.Width.isVector()) { - std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. - VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); - BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); - preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); - EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, - BestEpiPlan); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, - BFI, PSI, Checks, *BestMainPlan); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - *BestMainPlan, MainILV, DT, false); - ++LoopsVectorized; - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - EpilogueVectorizerEpilogueLoop EpilogILV( - L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); - EpilogILV.setTripCount(MainILV.getTripCount()); - preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - - LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT, true); - - // Fix induction resume values from the additional bypass block. - BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock(); - IRBuilder<> BypassBuilder(BypassBlock, - BypassBlock->getFirstInsertionPt()); - BasicBlock *PH = L->getLoopPreheader(); - for (const auto &[IVPhi, II] : LVL.getInductionVars()) { - auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); - Value *V = createInductionAdditionalBypassValues( - IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount, - LVL.getPrimaryInduction()); - // TODO: Directly add as extra operand to the VPResumePHI recipe. - Inc->setIncomingValueForBlock(BypassBlock, V); - } - ++LoopsEpilogueVectorized; + // If we decided that it is *legal* to interleave or vectorize the loop, then + // do it. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, - VF.MinProfitableTripCount, IC, &CM, BFI, PSI, - Checks, BestPlan); - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. - VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there - // are no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } - // Report the vectorization decision. - reportVectorization(ORE, L, VF, IC); - } + VPlan &BestPlan = LVP.getPlanFor(VF.Width); + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + LVP.selectEpilogueVectorizationFactor(VF.Width, IC); + if (EpilogueVF.Width.isVector()) { + std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); + BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); + preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, + BestEpiPlan); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, + PSI, Checks, *BestMainPlan); + auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, + *BestMainPlan, MainILV, DT, false); + ++LoopsVectorized; + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, + BFI, PSI, Checks, BestEpiPlan); + EpilogILV.setTripCount(MainILV.getTripCount()); + preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); + + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, + true); + + fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L, + BestEpiPlan, LVL, ExpandedSCEVs, + EPI.VectorTripCount); + ++LoopsEpilogueVectorized; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI, + Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. + VPlanTransforms::runPass( + VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width, + IC, PSE); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC, + VF.MinProfitableTripCount); - if (ORE->allowExtraAnalysis(LV_NAME)) - checkMixedPrecision(L, ORE); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + ++LoopsVectorized; } assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "DT not preserved correctly"); + assert(!verifyFunction(*F, &dbgs())); - std::optional<MDNode *> RemainderLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID) { - L->setLoopID(*RemainderLoopID); - } else { - if (DisableRuntimeUnroll) - addRuntimeUnrollDisableMetaData(L); - - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); - } - - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, DB = &AM.getResult<DemandedBitsAnalysis>(F); ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); LAIs = &AM.getResult<LoopAccessAnalysis>(F); + AA = &AM.getResult<AAManager>(F); auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37dc41413966..6a56dbfaa015 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper { return false; } bool equal(unsigned Opcode) { - if (Opcode == I->getOpcode()) - return trySet(MainOpBIT, MainOpBIT); - return false; + return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT); } unsigned getOpcode() const { MaskType Candidate = Mask & SeenBefore; @@ -5576,7 +5574,23 @@ private: if (auto *SD = dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, {}); + SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles; + SmallVector<ScheduleBundle *> Bundles; + Instruction *In = SD->getInst(); + if (R.isVectorized(In)) { + ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In); + for (TreeEntry *TE : Entries) { + if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) && + In->getNumOperands() != TE->getNumOperands()) + continue; + auto &BundlePtr = + PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>()); + BundlePtr->setTreeEntry(TE); + BundlePtr->add(SD); + Bundles.push_back(BundlePtr.get()); + } + } + ProcessBundleMember(SD, Bundles); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); @@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) { } /// Checks if the provided list of pointers \p Pointers represents the strided -/// pointers for type ElemTy. If they are not, std::nullopt is returned. -/// Otherwise, if \p Inst is not specified, just initialized optional value is -/// returned to show that the pointers represent strided pointers. If \p Inst -/// specified, the runtime stride is materialized before the given \p Inst. -/// \returns std::nullopt if the pointers are not pointers with the runtime -/// stride, nullptr or actual stride value, otherwise. -static std::optional<Value *> -calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &SortedIndices, - Instruction *Inst = nullptr) { +/// pointers for type ElemTy. If they are not, nullptr is returned. +/// Otherwise, SCEV* of the stride value is returned. +static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { SmallVector<const SCEV *> SCEVs; const SCEV *PtrSCEVLowest = nullptr; const SCEV *PtrSCEVHighest = nullptr; @@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, for (Value *Ptr : PointerOps) { const SCEV *PtrSCEV = SE.getSCEV(Ptr); if (!PtrSCEV) - return std::nullopt; + return nullptr; SCEVs.push_back(PtrSCEV); if (!PtrSCEVLowest && !PtrSCEVHighest) { PtrSCEVLowest = PtrSCEVHighest = PtrSCEV; @@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, } const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Diff)) - return std::nullopt; + return nullptr; if (Diff->isNonConstantNegative()) { PtrSCEVLowest = PtrSCEV; continue; } const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV); if (isa<SCEVCouldNotCompute>(Diff1)) - return std::nullopt; + return nullptr; if (Diff1->isNonConstantNegative()) { PtrSCEVHighest = PtrSCEV; continue; @@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, // Dist = PtrSCEVHighest - PtrSCEVLowest; const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Dist)) - return std::nullopt; + return nullptr; int Size = DL.getTypeStoreSize(ElemTy); auto TryGetStride = [&](const SCEV *Dist, const SCEV *Multiplier) -> const SCEV * { @@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1)); Stride = TryGetStride(Dist, Sz); if (!Stride) - return std::nullopt; + return nullptr; } if (!Stride || isa<SCEVConstant>(Stride)) - return std::nullopt; + return nullptr; // Iterate through all pointers and check if all distances are // unique multiple of Stride. using DistOrdPair = std::pair<int64_t, int>; @@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); const SCEV *Coeff = TryGetStride(Diff, Stride); if (!Coeff) - return std::nullopt; + return nullptr; const auto *SC = dyn_cast<SCEVConstant>(Coeff); if (!SC || isa<SCEVCouldNotCompute>(SC)) - return std::nullopt; + return nullptr; if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, SE.getMulExpr(Stride, SC))) ->isZero()) - return std::nullopt; + return nullptr; Dist = SC->getAPInt().getZExtValue(); } // If the strides are not the same or repeated, we can't vectorize. if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) - return std::nullopt; + return nullptr; auto Res = Offsets.emplace(Dist, Cnt); if (!Res.second) - return std::nullopt; + return nullptr; // Consecutive order if the inserted element is the last one. IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); ++Cnt; } if (Offsets.size() != SCEVs.size()) - return std::nullopt; + return nullptr; SortedIndices.clear(); if (!IsConsecutive) { // Fill SortedIndices array only if it is non-consecutive. @@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, ++Cnt; } } - if (!Inst) - return nullptr; - SCEVExpander Expander(SE, DL, "strided-load-vec"); - return Expander.expandCodeFor(Stride, Stride->getType(), Inst); + return Stride; } static std::pair<InstructionCost, InstructionCost> @@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() { // it is an attempt to reorder node with reused scalars but with // external uses. if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + OrdersUses.try_emplace(OrdersType(), 0).first->second += ExternalUserReorderIndices.size(); } else { for (const OrdersType &ExtOrder : ExternalUserReorderIndices) - ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + ++OrdersUses.try_emplace(ExtOrder, 0).first->second; } // No other useful reorder data in this entry. if (Order.empty()) @@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + ++OrdersUses.try_emplace(CurrentOrder, 0).first->second; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + ++OrdersUses.try_emplace(Order, 0).first->second; } } if (OrdersUses.empty()) @@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += - NumOps; + OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps; } else { - OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + OrdersUses.try_emplace(Order, 0).first->second += NumOps; } - auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + auto Res = OrdersUses.try_emplace(OrdersType(), 0); const auto AllowsReordering = [&](const TreeEntry *TE) { if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || @@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis { } } } - if (MainOp) + if (MainOp) { + // Do not match, if any copyable is a terminator from the same block as + // the main operation. + if (any_of(VL, [&](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && I->getParent() == MainOp->getParent() && + I->isTerminator(); + })) { + MainOp = nullptr; + return; + } MainOpcode = MainOp->getOpcode(); + } } /// Returns the idempotent value for the \p MainOp with the detected \p @@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( } SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars); if (all_of(VL, [&](Value *V) { - return isa<PoisonValue>(V) || Values.contains(V); + return isa<PoisonValue>(V) || Values.contains(V) || + (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) && + LI->getLoopFor(S.getMainOp()->getParent()) && + isVectorized(V)); })) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false); @@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); } + Value *getVectorizedValue(const TreeEntry &E) { + Value *Vec = E.VectorizedValue; + if (!Vec->getType()->isIntOrIntVectorTy()) + return Vec; + return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) { + return !isa<PoisonValue>(V) && + !isKnownNonNegative( + V, SimplifyQuery(*R.DL)); + })); + } + public: ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {} @@ -18001,35 +18030,14 @@ public: /// Adds 2 input vectors (in form of tree entries) and the mask for their /// shuffling. void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); - Value *V2 = E2.VectorizedValue; - if (V2->getType()->isIntOrIntVectorTy()) - V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); + Value *V2 = getVectorizedValue(E2); add(V1, V2, Mask); } /// Adds single input vector (in form of tree entry) and the mask for its /// shuffling. void add(const TreeEntry &E1, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); add(V1, Mask); } /// Adds 2 input vectors and the mask for their shuffling. @@ -18178,14 +18186,7 @@ public: auto CreateSubVectors = [&](Value *Vec, SmallVectorImpl<int> &CommonMask) { for (auto [E, Idx] : SubVectors) { - Value *V = E->VectorizedValue; - if (V->getType()->isIntOrIntVectorTy()) - V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V = getVectorizedValue(*E); unsigned InsertionIndex = Idx * getNumElements(ScalarTy); // Use scalar version of the SCalarType to correctly handle shuffles // for revectorization. The revectorization mode operates by the @@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return cast<LoadInst>(V)->getPointerOperand(); }); OrdersType Order; - std::optional<Value *> Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, - &*Builder.GetInsertPoint()); + const SCEV *StrideSCEV = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); + assert(StrideSCEV && "At this point stride should be known"); + SCEVExpander Expander(*SE, *DL, "strided-load-vec"); + Value *Stride = Expander.expandCodeFor( + StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint()); Value *NewStride = - Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); + Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); StrideVal = Builder.CreateMul( NewStride, ConstantInt::get( @@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree( !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && - is_contained(VectorizableTree.front()->Scalars, I))) + is_contained(VectorizableTree.front()->Scalars, I)) && + !(!VectorizableTree.front()->isGather() && + VectorizableTree.front()->isCopyableElement(I))) continue; SmallVector<SelectInst *> LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { @@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, continue; } auto *SD = cast<ScheduleData>(SE); + if (SD->hasValidDependencies() && + (!S.areInstructionsWithCopyableElements() || + !S.isCopyableElement(SD->getInst())) && + !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE && + EI.UserTE->hasState() && + (!EI.UserTE->hasCopyableElements() || + !EI.UserTE->isCopyableElement(SD->getInst()))) + SD->clearDirectDependencies(); for (const Use &U : SD->getInst()->operands()) { unsigned &NumOps = UserOpToNumOps @@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (auto *Op = dyn_cast<Instruction>(U.get()); Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, *SLP, NumOps)) { - if (ScheduleData *OpSD = getScheduleData(Op)) { + if (ScheduleData *OpSD = getScheduleData(Op); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ScheduleCopyableDataMapByUsers.erase(I); ScheduleCopyableDataMap.erase(KV); // Need to recalculate dependencies for the actual schedule data. - if (ScheduleData *OpSD = getScheduleData(I)) { + if (ScheduleData *OpSD = getScheduleData(I); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote( return TryProcessInstruction(BitWidth); case Instruction::ZExt: case Instruction::SExt: + if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() && + E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast && + E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy()) + return false; IsProfitableToDemote = true; return TryProcessInstruction(BitWidth); @@ -23797,9 +23817,7 @@ public: size_t Key, Idx; std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, /*AllowAlternate=*/false); - ++PossibleReducedVals[Key][Idx] - .insert(std::make_pair(V, 0)) - .first->second; + ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second; } for (Instruction *I : reverse(PossibleReductionOps)) Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1); @@ -23820,21 +23838,20 @@ public: stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { return P1.size() > P2.size(); }); - int NewIdx = -1; + bool First = true; for (ArrayRef<Value *> Data : PossibleRedValsVect) { - if (NewIdx < 0 || - (!isGoodForReduction(Data) && - (!isa<LoadInst>(Data.front()) || - !isa<LoadInst>(ReducedVals[NewIdx].front()) || - getUnderlyingObject( - cast<LoadInst>(Data.front())->getPointerOperand()) != - getUnderlyingObject( - cast<LoadInst>(ReducedVals[NewIdx].front()) - ->getPointerOperand())))) { - NewIdx = ReducedVals.size(); + if (First) { + First = false; ReducedVals.emplace_back(); + } else if (!isGoodForReduction(Data)) { + auto *LI = dyn_cast<LoadInst>(Data.front()); + auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front()); + if (!LI || !LastLI || + getUnderlyingObject(LI->getPointerOperand()) != + getUnderlyingObject(LastLI->getPointerOperand())) + ReducedVals.emplace_back(); } - ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); + ReducedVals.back().append(Data.rbegin(), Data.rend()); } } // Sort the reduced values by number of same/alternate opcode and/or pointer @@ -23847,7 +23864,8 @@ public: /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI, AssumptionCache *AC) { + const TargetLibraryInfo &TLI, AssumptionCache *AC, + DominatorTree &DT) { constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce @@ -24164,9 +24182,7 @@ public: // previous vectorization attempts. if (any_of(VL, [&V](Value *RedVal) { auto *RedValI = dyn_cast<Instruction>(RedVal); - if (!RedValI) - return false; - return V.isDeleted(RedValI); + return RedValI && V.isDeleted(RedValI); })) break; V.buildTree(VL, IgnoreList); @@ -24248,7 +24264,7 @@ public: // Estimate cost. InstructionCost ReductionCost = - getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V); + getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI); InstructionCost Cost = V.getTreeCost(VL, ReductionCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); @@ -24553,7 +24569,9 @@ private: InstructionCost getReductionCost(TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals, bool IsCmpSelMinMax, FastMathFlags FMF, - const BoUpSLP &R) { + const BoUpSLP &R, DominatorTree &DT, + const DataLayout &DL, + const TargetLibraryInfo &TLI) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = ReducedVals.front()->getType(); unsigned ReduxWidth = ReducedVals.size(); @@ -24578,6 +24596,22 @@ private: for (User *U : RdxVal->users()) { auto *RdxOp = cast<Instruction>(U); if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { + if (RdxKind == RecurKind::FAdd) { + InstructionCost FMACost = canConvertToFMA( + RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI); + if (FMACost.isValid()) { + LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n"); + if (auto *I = dyn_cast<Instruction>(RdxVal)) { + // Also, exclude scalar fmul cost. + InstructionCost FMulCost = + TTI->getInstructionCost(I, CostKind); + LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + ScalarCost += FMACost; + continue; + } + } ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); continue; } @@ -24642,8 +24676,45 @@ private: auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); VectorType *RVecTy = getWidenedType(RType, ReduxWidth); - VectorCost += - TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); + InstructionCost FMACost = InstructionCost::getInvalid(); + if (RdxKind == RecurKind::FAdd) { + // Check if the reduction operands can be converted to FMA. + SmallVector<Value *> Ops; + FastMathFlags FMF; + FMF.set(); + for (Value *RdxVal : ReducedVals) { + if (!RdxVal->hasOneUse()) { + Ops.clear(); + break; + } + if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal)) + FMF &= FPCI->getFastMathFlags(); + Ops.push_back(RdxVal->user_back()); + } + if (!Ops.empty()) { + FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL, + *TTI, TLI); + if (FMACost.isValid()) { + // Calculate actual FMAD cost. + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy, + {RVecTy, RVecTy, RVecTy}, FMF); + FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind); + + LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n"); + // Also, exclude vector fmul cost. + InstructionCost FMulCost = TTI->getArithmeticInstrCost( + Instruction::FMul, RVecTy, CostKind); + LLVM_DEBUG(dbgs() + << "Minus vector FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + } + } + if (FMACost.isValid()) + VectorCost += FMACost; + else + VectorCost += + TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); if (RType != RedTy) { unsigned Opcode = Instruction::Trunc; if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) @@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( HorizontalReduction HorRdx; if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (RedCost >= ScalarCost) return false; - return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr; + return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr; }; if (Candidates.size() == 1) return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R); @@ -25540,7 +25611,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, template <typename T> static bool tryToVectorizeSequence( SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, - function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, T *)> AreCompatible, function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R) { bool Changed = false; @@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence( auto *SameTypeIt = IncIt; while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || R.isDeleted(cast<Instruction>(*SameTypeIt)) || - AreCompatible(*SameTypeIt, *IncIt))) { + AreCompatible(VL, *SameTypeIt))) { auto *I = dyn_cast<Instruction>(*SameTypeIt); ++SameTypeIt; if (I && !R.isDeleted(I)) @@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, return compareCmp<false>(V, V2, *TLI, *DT); }; - auto AreCompatibleCompares = [&](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) { + if (VL.empty() || VL.back() == V1) return true; - return compareCmp<true>(V1, V2, *TLI, *DT); + return compareCmp<true>(V1, VL.back(), *TLI, *DT); }; SmallVector<Value *> Vals; @@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return false; }; - auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL, + Value *V1) { + if (VL.empty() || V1 == VL.back()) return true; + Value *V2 = VL.back(); if (V1->getType() != V2->getType()) return false; ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; @@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { InstSetVector PostProcessInserts; SmallSetVector<CmpInst *, 8> PostProcessCmps; - // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true + // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true // also vectorizes `PostProcessCmps`. auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { bool Changed = vectorizeInserts(PostProcessInserts, BB, R); @@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { + bool SameParent = true; + auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) { + if (VL.empty()) { + SameParent = true; + return true; + } + StoreInst *V2 = VL.back(); if (V1 == V2) return true; if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) @@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (isa<UndefValue>(V1->getValueOperand()) || isa<UndefValue>(V2->getValueOperand())) return true; - if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand())) - if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { - if (I1->getParent() != I2->getParent()) - return false; - return getSameOpcode({I1, I2}, *TLI).valid(); - } if (isa<Constant>(V1->getValueOperand()) && isa<Constant>(V2->getValueOperand())) return true; + // Check if the operands of the stores can be vectorized. They can be + // vectorized, if they have compatible operands or have operands, which can + // be vectorized as copyables. + auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()); + auto *I2 = dyn_cast<Instruction>(V2->getValueOperand()); + if (I1 || I2) { + // Accept only tail-following non-compatible values for now. + // TODO: investigate if it is possible to vectorize incompatible values, + // if the copyables are first in the list. + if (I1 && !I2) + return false; + SameParent &= I1 && I2 && I1->getParent() == I2->getParent(); + SmallVector<Value *> NewVL(VL.size() + 1); + for (auto [SI, V] : zip(VL, NewVL)) + V = SI->getValueOperand(); + NewVL.back() = V1->getValueOperand(); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true, + /*SkipSameCodeCheck=*/!SameParent); + if (S) + return true; + if (!SameParent) + return false; + } return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f972efa07eb7..16b1b539345d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <cassert> #include <string> @@ -55,6 +56,15 @@ namespace llvm { extern cl::opt<bool> EnableVPlanNativePath; } +/// @{ +/// Metadata attribute names +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = + "llvm.loop.vectorize.followup_vectorized"; +const char LLVMLoopVectorizeFollowupEpilogue[] = + "llvm.loop.vectorize.followup_epilogue"; +/// @} + extern cl::opt<unsigned> ForceTargetInstructionCost; static cl::opt<bool> PrintVPlansInDotFormat( @@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) { for (unsigned i = 0; i < WorkList.size(); i++) { T *Current = WorkList[i]; - if (Current->getNumPredecessors() == 0) + if (!Current->hasPredecessors()) return Current; auto &Predecessors = Current->getPredecessors(); WorkList.insert_range(Predecessors); @@ -216,7 +226,7 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB, // If VPBB is in a region R, VPBB is a loop header if R is a loop region with // VPBB as its entry, i.e., free of predecessors. if (auto *R = VPBB->getParent()) - return !R->isReplicator() && VPBB->getNumPredecessors() == 0; + return !R->isReplicator() && !VPBB->hasPredecessors(); // A header dominates its second predecessor (the latch), with the other // predecessor being the preheader @@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); + // Move completely disconnected blocks to their final position. + if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB)) + IRBB->moveAfter(State->CFG.PrevBB); State->Builder.SetInsertPoint(IRBB->getTerminator()); State->CFG.PrevBB = IRBB; State->CFG.VPBB2IRBB[this] = IRBB; @@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const { const VPBlockBase *Pred = nullptr; - if (getNumPredecessors() > 0) { + if (hasPredecessors()) { Pred = getPredecessors()[Idx]; } else { auto *Region = getParent(); @@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() { BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); VPIRBasicBlock *NewScalarHeader = nullptr; - if (getScalarHeader()->getNumPredecessors() == 0) { - NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); - } else { + if (getScalarHeader()->hasPredecessors()) { NewScalarHeader = cast<VPIRBasicBlock>(*find_if( vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) { auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB); return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; })); + } else { + NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); } // Create VPlan, clone live-ins and remap operands in the cloned blocks. auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader); @@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) { std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str(); // First assign the base name for V. - const auto &[A, _] = VPValue2Name.insert({V, BaseName}); + const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName); // Integer or FP constants with different types will result in he same string // due to stripping types. if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV)) @@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) { // If it is already used by C > 0 other VPValues, increase the version counter // C and use it for V. - const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0}); + const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0); if (!UseInserted) { C->second++; A->second = (BaseName + Twine(".") + Twine(C->second)).str(); @@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void addRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. + for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + if (S->getString().starts_with("llvm.loop.unroll.runtime.disable")) + continue; + IsUnrollMetadata = + S->getString().starts_with("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(I)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + +void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo( + Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue, + unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) { + MDNode *LID = OrigLoop->getLoopID(); + // Update the metadata of the scalar loop. Skip the update when vectorizing + // the epilogue loop, to ensure it is only updated once. + if (!VectorizingEpilogue) { + std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID( + LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); + if (RemainderLoopID) { + OrigLoop->setLoopID(*RemainderLoopID); + } else { + if (DisableRuntimeUnroll) + addRuntimeUnrollDisableMetaData(OrigLoop); + + LoopVectorizeHints Hints(OrigLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + } + + if (!VectorLoop) + return; + + if (std::optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized})) { + VectorLoop->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (LID) + VectorLoop->setLoopID(LID); + + if (!VectorizingEpilogue) { + LoopVectorizeHints Hints(VectorLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + + // Check if it's EVL-vectorized and mark the corresponding metadata. + bool IsEVLVectorized = + llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { + // Looking for the ExplictVectorLength VPInstruction. + if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) + return VI->getOpcode() == VPInstruction::ExplicitVectorLength; + return false; + }); + if (IsEVLVectorized) { + LLVMContext &Context = VectorLoop->getHeader()->getContext(); + MDNode *LoopID = VectorLoop->getLoopID(); + auto *IsEVLVectorizedMD = MDNode::get( + Context, + {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), + MDString::get(Context, "evl")}); + MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, + {IsEVLVectorizedMD}); + VectorLoop->setLoopID(NewLoopID); + } + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(VectorLoop); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. Note that original loop + // becomes the scalar remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpiloque() we may + // end up getting slightly roughened result but that should be OK since + // profile is not inherently precise anyway. Note also possible bypass of + // vector code caused by legality checks is ignored, assigning all the weight + // to the vector loop, optimistically. + // + // For scalable vectorization we can't know at compile time how many + // iterations of the loop are handled in one vector iteration, so instead + // use the value of vscale used for tuning. + setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { if (VPlans.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d6bc462a0dfa..53291a931530 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -219,6 +219,9 @@ public: size_t getNumSuccessors() const { return Successors.size(); } size_t getNumPredecessors() const { return Predecessors.size(); } + /// Returns true if this block has any predecessors. + bool hasPredecessors() const { return !Predecessors.empty(); } + /// An Enclosing Block of a block B is any block containing B, including B /// itself. \return the closest enclosing block starting from "this", which /// has successors. \return the root enclosing block if all enclosing blocks @@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase public: VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPDef(SC), VPUser(Operands), DL(DL) {} virtual ~VPRecipeBase() = default; @@ -518,11 +521,11 @@ protected: class VPSingleDefRecipe : public VPRecipeBase, public VPValue { public: VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this) {} VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - Value *UV, DebugLoc DL = {}) + Value *UV, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {} static inline bool classof(const VPRecipeBase *R) { @@ -557,6 +560,7 @@ public: case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPInterleaveEVLSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: case VPRecipeBase::VPWidenLoadEVLSC: @@ -712,12 +716,15 @@ public: VPIRFlags(GEPNoWrapFlags GEPFlags) : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} -public: void transferFlags(VPIRFlags &Other) { OpType = Other.OpType; AllFlags = Other.AllFlags; } + /// Only keep flags also present in \p Other. \p Other must have the same + /// OpType as the current object. + void intersectFlags(const VPIRFlags &Other); + /// Drop all poison-generating flags. void dropPoisonGeneratingFlags() { // NOTE: This needs to be kept in-sync with @@ -864,7 +871,7 @@ public: /// using IR flags. struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, @@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}) + const VPIRFlags &Flags, + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {} static inline bool classof(const VPRecipeBase *R) { @@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { return R && classof(R); } + static inline bool classof(const VPSingleDefRecipe *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } + void execute(VPTransformState &State) override = 0; /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx. @@ -975,6 +988,10 @@ public: Not, SLPLoad, SLPStore, + // Creates a mask where each lane is active (true) whilst the current + // counter (first operand + index) is less than the second operand. i.e. + // mask[i] = icmpt ult (op0 + i), op1 + // The size of the mask returned is VF * Multiplier (UF, third op). ActiveLaneMask, ExplicitVectorLength, CalculateTripCountMinusVF, @@ -1014,7 +1031,8 @@ public: // Returns a scalar boolean value, which is true if any lane of its // (boolean) vector operands is true. It produces the reduced value across // all unrolled iterations. Unrolling will add all copies of its original - // operand as additional operands. + // operand as additional operands. AnyOf is poison-safe as all operands + // will be frozen. AnyOf, // Calculates the first active lane index of the vector predicate operands. // It produces the lane index across all unrolled iterations. Unrolling will @@ -1080,13 +1098,13 @@ private: #endif public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {}, - const Twine &Name = "") + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}, + const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1479,7 +1497,8 @@ public: } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - const VPIRFlags &Flags = {}, DebugLoc DL = {}) + const VPIRFlags &Flags = {}, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL), VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) { assert(flagsValidForOpcode(Opcode) && @@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), @@ -1546,7 +1565,7 @@ public: VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); @@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, public: VPWidenCallRecipe(Value *UV, Function *Variant, - ArrayRef<VPValue *> CallArguments, DebugLoc DL = {}) + ArrayRef<VPValue *> CallArguments, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, *cast<Instruction>(UV)), VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) { @@ -1644,10 +1664,8 @@ public: return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); } - operand_range args() { return make_range(op_begin(), std::prev(op_end())); } - const_operand_range args() const { - return make_range(op_begin(), std::prev(op_end())); - } + operand_range args() { return drop_end(operands()); } + const_operand_range args() const { return drop_end(operands()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase { public: VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {} ~VPHistogramRecipe() override = default; @@ -1998,6 +2016,9 @@ public: return getOperand(1); } + /// Update the incoming value from the loop backedge. + void setBackedgeValue(VPValue *V) { setOperand(1, V); } + /// Returns the backedge value as a recipe. The backedge value is guaranteed /// to be a recipe. virtual VPRecipeBase &getBackedgeRecipe() { @@ -2229,8 +2250,8 @@ protected: public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. - VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {}, - const Twine &Name = "") + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL), Name(Name.str()) { if (Start) @@ -2381,9 +2402,8 @@ public: } VPBlendRecipe *clone() override { - SmallVector<VPValue *> Ops(operands()); - return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops, - getDebugLoc()); + return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), + operands(), getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPBlendSC) @@ -2409,6 +2429,12 @@ public: return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized()); } + /// Set mask number \p Idx to \p V. + void setMask(unsigned Idx, VPValue *V) { + assert((Idx > 0 || !isNormalized()) && "First index has no mask!"); + Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V); + } + void execute(VPTransformState &State) override { llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); } @@ -2434,12 +2460,13 @@ public: } }; -/// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. The first operand of a -/// VPInterleave recipe is the address, followed by the stored values, followed -/// by an optional mask. -class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, - public VPIRMetadata { +/// A common base class for interleaved memory operations. +/// An Interleaved memory operation is a memory access method that combines +/// multiple strided loads/stores into a single wide load/store with shuffles. +/// The first operand is the start address. The optional operands are, in order, +/// the stored values and the mask. +class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase, + public VPIRMetadata { const InterleaveGroup<Instruction> *IG; /// Indicates if the interleave group is in a conditional block and requires a @@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, /// unusued gaps can be loaded speculatively. bool NeedsMaskForGaps = false; -public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask, - bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) - : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD), - IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) { +protected: + VPInterleaveBase(const unsigned char SC, + const InterleaveGroup<Instruction> *IG, + ArrayRef<VPValue *> Operands, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG), + NeedsMaskForGaps(NeedsMaskForGaps) { // TODO: extend the masked interleaved-group support to reversed access. assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); @@ -2473,14 +2502,19 @@ public: addOperand(Mask); } } - ~VPInterleaveRecipe() override = default; - VPInterleaveRecipe *clone() override { - return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(), - NeedsMaskForGaps, *this, getDebugLoc()); +public: + VPInterleaveBase *clone() override = 0; + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPInterleaveSC || + R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC; } - VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } /// Return the address accessed by this recipe. VPValue *getAddr() const { @@ -2490,48 +2524,130 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last, currently 2nd operand. + // Mask is optional and the last operand. return HasMask ? getOperand(getNumOperands() - 1) : nullptr; } + /// Return true if the access needs a mask because of the gaps. + bool needsMaskForGaps() const { return NeedsMaskForGaps; } + + const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; } + + Instruction *getInsertPos() const { return IG->getInsertPos(); } + + void execute(VPTransformState &State) override { + llvm_unreachable("VPInterleaveBase should not be instantiated."); + } + + /// Return the cost of this recipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + + /// Returns the number of stored operands of this interleave group. Returns 0 + /// for load interleave groups. + virtual unsigned getNumStoreOperands() const = 0; + /// Return the VPValues stored by this interleave group. If it is a load /// interleave group, return an empty ArrayRef. ArrayRef<VPValue *> getStoredValues() const { - // The first operand is the address, followed by the stored values, followed - // by an optional mask. - return ArrayRef<VPValue *>(op_begin(), getNumOperands()) - .slice(1, getNumStoreOperands()); + return ArrayRef<VPValue *>(op_end() - + (getNumStoreOperands() + (HasMask ? 1 : 0)), + getNumStoreOperands()); + } +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. +class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { +public: + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask, + NeedsMaskForGaps, MD, DL) {} + + ~VPInterleaveRecipe() override = default; + + VPInterleaveRecipe *clone() override { + return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(), + getStoredValues(), getMask(), + needsMaskForGaps(), *this, getDebugLoc()); } + VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; - /// Return the cost of this VPInterleaveRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif - const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + } - /// Returns the number of stored operands of this interleave group. Returns 0 - /// for load interleave groups. - unsigned getNumStoreOperands() const { - return getNumOperands() - (HasMask ? 2 : 1); + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 2 : 1); } +}; + +/// A recipe for interleaved memory operations with vector-predication +/// intrinsics. The first operand is the address, the second operand is the +/// explicit vector length. Stored values and mask are optional operands. +class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { +public: + VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask) + : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(), + ArrayRef<VPValue *>({R.getAddr(), &EVL}), + R.getStoredValues(), Mask, R.needsMaskForGaps(), R, + R.getDebugLoc()) { + assert(!getInterleaveGroup()->isReverse() && + "Reversed interleave-group with tail folding is not supported."); + assert(!needsMaskForGaps() && "Interleaved access with gap mask is not " + "supported for scalable vector."); + } + + ~VPInterleaveEVLRecipe() override = default; + + VPInterleaveEVLRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC) + + /// The VPValue of the explicit vector length. + VPValue *getEVL() const { return getOperand(1); } - /// The recipe only uses the first lane of the address. + /// Generate the wide load or store, and shuffles. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The recipe only uses the first lane of the address, and EVL operand. bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || + Op == getEVL(); } - Instruction *getInsertPos() const { return IG->getInsertPos(); } + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 3 : 2); + } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2561,14 +2677,14 @@ protected: public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} @@ -2686,7 +2802,7 @@ public: class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { public: VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe( VPDef::VPReductionEVLSC, R.getRecurrenceKind(), R.getFastMathFlags(), @@ -3537,7 +3653,8 @@ public: InductionOpcode(Opcode) {} VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, - VPValue *Step, VPValue *VF, DebugLoc DL = {}) + VPValue *Step, VPValue *VF, + DebugLoc DL = DebugLoc::getUnknown()) : VPScalarIVStepsRecipe( IV, Step, VF, IndDesc.getInductionOpcode(), dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()) @@ -4142,7 +4259,7 @@ public: /// Returns an iterator range over all VFs of the plan. iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() const { - return {VFs.begin(), VFs.end()}; + return VFs; } bool hasScalarVFOnly() const { @@ -4299,9 +4416,8 @@ public: /// via the other early exit). bool hasEarlyExit() const { return count_if(ExitBlocks, - [](VPIRBasicBlock *EB) { - return EB->getNumPredecessors() != 0; - }) > 1 || + [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) > + 1 || (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } @@ -4309,7 +4425,7 @@ public: /// that this relies on unneeded branches to the scalar tail loop being /// removed. bool hasScalarTail() const { - return !(getScalarPreheader()->getNumPredecessors() == 0 || + return !(!getScalarPreheader()->hasPredecessors() || getScalarPreheader()->getSinglePredecessor() == getEntry()); } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 747c6623aa22..d400ceff7797 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) - .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) { + .Case<VPInterleaveBase>([V](const auto *R) { // TODO: Use info from interleave group. return V->getUnderlyingValue()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 80b48de57b40..cef91c15dd87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } if (auto *SI = dyn_cast<SwitchInst>(Inst)) { + // Don't emit recipes for unconditional switch instructions. + if (SI->getNumCases() == 0) + continue; SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); @@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, } void VPlanTransforms::handleEarlyExits(VPlan &Plan, - bool HasUncountableEarlyExit, - VFRange &Range) { + bool HasUncountableEarlyExit) { auto *MiddleVPBB = cast<VPBasicBlock>( Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); @@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan, assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - cast<VPBasicBlock>(HeaderVPB), LatchVPBB, - Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, } } +void VPlanTransforms::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) { + // Generate code to check if the loop's trip count is less than VF * UF, or + // equal to it in case a scalar epilogue is required; this implies that the + // vector trip count is zero. This check also covers the case where adding one + // to the backedge-taken count overflowed leading to an incorrect trip count + // of zero. In this case we will also jump to the scalar loop. + CmpInst::Predicate CmpPred = + RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + // If tail is to be folded, vector loop takes care of all iterations. + VPValue *TripCountVPV = Plan.getTripCount(); + const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE); + Type *TripCountTy = TripCount->getType(); + auto GetMinTripCount = [&]() -> const SCEV * { + // Compute max(MinProfitableTripCount, UF * VF) and return it. + const SCEV *VFxUF = + SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW); + if (UF * VF.getKnownMinValue() >= + MinProfitableTripCount.getKnownMinValue()) { + // TODO: SCEV should be able to simplify test. + return VFxUF; + } + const SCEV *MinProfitableTripCountSCEV = + SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW); + return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF); + }; + + VPBasicBlock *EntryVPBB = Plan.getEntry(); + VPBuilder Builder(EntryVPBB); + VPValue *TripCountCheck = Plan.getFalse(); + const SCEV *Step = GetMinTripCount(); + if (TailFolded) { + if (CheckNeededWithTailFolding) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. + VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( + TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *DistanceToMax = Builder.createNaryOp( + Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, + DebugLoc::getUnknown()); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF, + // minProfitableTripCount). + TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax, + Builder.createExpandSCEV(Step), DL); + } else { + // TripCountCheck = false, folding tail implies positive vector trip + // count. + } + } else { + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. + TripCount = SE.applyLoopGuards(TripCount, OrigLoop); + // Check if the trip count is < the step. + if (SE.isKnownPredicate(CmpPred, TripCount, Step)) { + // TODO: Ensure step is at most the trip count when determining max VF and + // UF, w/o tail folding. + TripCountCheck = Plan.getTrue(); + } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred), + TripCount, Step)) { + // Generate the minimum iteration check only if we cannot prove the + // check is known to be true, or known to be false. + VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step); + TripCountCheck = Builder.createICmp( + CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check"); + } // else step known to be < trip count, use TripCountCheck preset to false. + } + VPInstruction *Term = + Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL); + if (MinItersBypassWeights) { + MDBuilder MDB(Plan.getContext()); + MDNode *BranchWeights = MDB.createBranchWeights( + ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); + Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + } +} + bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 1ec6ae677374..109156c1469c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() { return int_pred_ty<is_all_ones>(); } +struct is_zero_int { + bool isValue(const APInt &C) const { return C.isZero(); } +}; + +/// Match an integer 0 or a vector with all elements equal to 0. +/// For vectors, this includes constants with undefined elements. +inline int_pred_ty<is_zero_int> m_ZeroInt() { + return int_pred_ty<is_zero_int>(); +} + /// Matching combinators template <typename LTy, typename RTy> struct match_combine_or { LTy L; @@ -218,9 +228,12 @@ struct Recipe_match { if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...)) return false; - assert(R->getNumOperands() == std::tuple_size<Ops_t>::value && - "recipe with matched opcode does not have the expected number of " - "operands"); + if (R->getNumOperands() != std::tuple_size<Ops_t>::value) { + assert(Opcode == Instruction::PHI && + "non-variadic recipe with matched opcode does not have the " + "expected number of operands"); + return false; + } auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>(); if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) { @@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) { } template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t> +m_EVL(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0); +} + +template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t> m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } -template <typename Op0_t, typename Op1_t> -inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t> -m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { - return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); + +template <typename Op0_t, typename Op1_t, typename Op2_t> +inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> +m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2); } template <typename Op0_t, typename Op1_t> @@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) { return m_CombineOr(m_ZExt(Op0), m_SExt(Op0)); } +template <typename Op0_t> +inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t> +m_ZExtOrSelf(const Op0_t &Op0) { + return m_CombineOr(m_ZExt(Op0), Op0); +} + template <unsigned Opcode, typename Op0_t, typename Op1_t> inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0, const Op1_t &Op1) { @@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } +/// Match a binary AND operation. +template <typename Op0_t, typename Op1_t> +inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t> +m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) { + return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1); +} + /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index cdadc33e3088..0c27d535b680 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -14,11 +14,13 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; +using namespace VPlanPatternMatch; namespace { class VPPredicator { @@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { "Distinct incoming values with one having a full mask"); break; } + OperandsWithMask.push_back(EdgeMask); } PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c4fdcccc6d62..bf5148954309 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const { return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); + case VPInterleaveEVLSC: case VPInterleaveSC: - return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; + return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; @@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return false; } default: + // FIXME: Return false if the recipe represents an interleaved store. return true; } } @@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { Instruction *UI = nullptr; if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) + else if (auto *IG = dyn_cast<VPInterleaveBase>(this)) UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) UI = &WidenMem->getIngredient(); @@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPIRFlags::intersectFlags(const VPIRFlags &Other) { + assert(OpType == Other.OpType && "OpType must match"); + switch (OpType) { + case OperationType::OverflowingBinOp: + WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; + WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; + break; + case OperationType::Trunc: + TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; + TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; + break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; + break; + case OperationType::PossiblyExactOp: + ExactFlags.IsExact &= Other.ExactFlags.IsExact; + break; + case OperationType::GEPOp: + GEPFlags &= Other.GEPFlags; + break; + case OperationType::FPMathOp: + FMFs.NoNaNs &= Other.FMFs.NoNaNs; + FMFs.NoInfs &= Other.FMFs.NoInfs; + break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; + break; + case OperationType::Cmp: + assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); + break; + case OperationType::Other: + assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); + break; + } +} + FastMathFlags VPIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::ICmp: case Instruction::FCmp: case Instruction::Store: - case VPInstruction::ActiveLaneMask: case VPInstruction::BranchOnCount: case VPInstruction::ComputeReductionResult: case VPInstruction::FirstOrderRecurrenceSplice: @@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::WideIVStep: return 2; case Instruction::Select: + case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: return 3; @@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Name); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); + auto PredTy = VectorType::get( + Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) + ->getZExtValue()); return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); @@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::AnyOf: { - Value *Res = State.get(getOperand(0)); + Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, State.get(Op)); + Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::ExtractLane: { @@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) { unsigned LastOpIdx = getNumOperands() - 1; Value *Res = nullptr; for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); + Value *TrailingZeros = + State.VF.isScalar() + ? Builder.CreateZExt( + Builder.CreateICmpEQ(State.get(getOperand(Idx)), + Builder.getFalse()), + Builder.getInt64Ty()) + : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), + State.get(getOperand(Idx)), + true, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { @@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { + case Instruction::Select: { + // TODO: It may be possible to improve this by analyzing where the + // condition operand comes from. + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); + if (!vputils::onlyFirstLaneUsed(this)) { + CondTy = toVectorTy(CondTy, VF); + VecTy = toVectorTy(VecTy, VF); + } + return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, + Ctx.CostKind); + } case Instruction::ExtractElement: case VPInstruction::ExtractLane: { + if (VF.isScalar()) { + // ExtractLane with VF=1 takes care of handling extracting across multiple + // parts. + return 0; + } + // Add on the cost of extracting the element. auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); } case VPInstruction::FirstActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + auto *PredTy = toVectorTy(ScalarTy, VF); IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx), {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); @@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } case VPInstruction::ActiveLaneMask: { Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + unsigned Multiplier = + cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); + Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, {ArgTy, ArgTy}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); @@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.set(this, V); } -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. +static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, + ArrayRef<const VPValue *> Operands, + const VPRecipeWithIRFlags &R, + ElementCount VF, + VPCostContext &Ctx) { // Some backends analyze intrinsic arguments to determine cost. Use the // underlying value for the operand if it has one. Otherwise try to use the // operand of the underlying call instruction, if there is one. Otherwise // clear Arguments. // TODO: Rework TTI interface to be independent of concrete IR values. SmallVector<const Value *> Arguments; - for (const auto &[Idx, Op] : enumerate(operands())) { + for (const auto &[Idx, Op] : enumerate(Operands)) { auto *V = Op->getUnderlyingValue(); if (!V) { - if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { + if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) { Arguments.push_back(UI->getArgOperand(Idx)); continue; } @@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); + Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); + Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; SmallVector<Type *> ParamTys; - for (unsigned I = 0; I != getNumOperands(); ++I) - ParamTys.push_back( - toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + for (const VPValue *Op : Operands) { + ParamTys.push_back(VF.isVector() + ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) + : Ctx.Types.inferScalarType(Op)); + } // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); + FastMathFlags FMF = + R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( - VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), + ID, RetTy, Arguments, ParamTys, FMF, + dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()), InstructionCost::getInvalid(), &Ctx.TLI); return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } +InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + SmallVector<const VPValue *> ArgOps(operands()); + return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); +} + StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } @@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // More complex computation, let the legacy cost-model handle this for now. - return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); + // If the div/rem operation isn't safe to speculate and requires + // predication, then the only way we can even create a vplan is to insert + // a select on the second input operand to ensure we use the value of 1 + // for the inactive lanes. The select will be costed separately. case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { if (VF.isScalar()) return TTI::CastContextHint::Normal; - if (isa<VPInterleaveRecipe>(R)) + if (isa<VPInterleaveBase>(R)) return TTI::CastContextHint::Interleave; if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked @@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); assert(RedTy->isIntegerTy() && "VPExpressionRecipe only supports integer types currently."); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - unsigned Opcode = RecurrenceDescriptor::getOpcode( - cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); return Ctx.TTI.getExtendedReductionCost( Opcode, cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == @@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); + return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, + Ctx.CostKind); case ExpressionTypes::ExtMulAccReduction: return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, - RedTy, SrcVecTy, Ctx.CostKind); + Opcode, RedTy, SrcVecTy, Ctx.CostKind); } llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); } @@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // instruction cost. return 0; case Instruction::Call: { - if (!isSingleScalar()) { - // TODO: Handle remaining call costs here as well. - if (VF.isScalable()) - return InstructionCost::getInvalid(); - break; - } - auto *CalledFn = cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - if (CalledFn->isIntrinsic()) - break; + SmallVector<const VPValue *> ArgOps(drop_end(operands())); SmallVector<Type *, 4> Tys; - for (VPValue *ArgOp : drop_end(operands())) + for (const VPValue *ArgOp : ArgOps) Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + + if (CalledFn->isIntrinsic()) + // Various pseudo-intrinsics with costs of 0 are scalarized instead of + // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. + switch (CalledFn->getIntrinsicID()) { + case Intrinsic::assume: + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: { + assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx) == 0 && + "scalarizing intrinsic should be free"); + return InstructionCost(0); + } + default: + break; + } + Type *ResultTy = Ctx.Types.inferScalarType(this); - return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + InstructionCost ScalarCallCost = + Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + if (isSingleScalar()) { + if (CalledFn->isIntrinsic()) + ScalarCallCost = std::min( + ScalarCallCost, + getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx)); + return ScalarCallCost; + } + + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + // Compute the cost of scalarizing the result and operands if needed. + InstructionCost ScalarizationCost = 0; + if (VF.isVector()) { + if (!ResultTy->isVoidTy()) { + for (Type *VectorTy : + to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { + ScalarizationCost += Ctx.TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, Ctx.CostKind); + } + } + // Skip operands that do not require extraction/scalarization and do not + // incur any overhead. + SmallPtrSet<const VPValue *, 4> UniqueOperands; + Tys.clear(); + for (auto *Op : ArgOps) { + if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + !UniqueOperands.insert(Op).second) + continue; + Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF)); + } + ScalarizationCost += + Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind); + } + + return ScalarCallCost * VF.getFixedValue() + ScalarizationCost; } case Instruction::Add: case Instruction::Sub: @@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx) * (isSingleScalar() ? 1 : VF.getFixedValue()); + case Instruction::Load: + case Instruction::Store: { + if (isSingleScalar()) { + bool IsLoad = UI->getOpcode() == Instruction::Load; + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); + return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); + } + // TODO: See getMemInstScalarizationCost for how to handle replicating and + // predicated cases. + break; } } @@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // TODO: Using the original IR may not be accurate. // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = Ptr->getType(); + + // If the address value is uniform across all lanes, then the address can be + // calculated with scalar type and broadcast. + if (!vputils::isSingleScalar(getAddr())) + PtrTy = toVectorTy(PtrTy, VF); + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, @@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); - assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + assert((!needsMaskForGaps() || !State.VF.isScalable()) && "Masking gaps for scalable vectors is not yet supported."); - const InterleaveGroup<Instruction> *Group = IG; + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); Instruction *Instr = Group->getInsertPos(); // Prepare for the vector type of the interleaved load/store. @@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { Value *MaskForGaps = nullptr; - if (NeedsMaskForGaps) { + if (needsMaskForGaps()) { MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); @@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved store group. Value *MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); - assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && + assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. @@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; @@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!needsMaskForGaps() && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *InterleaveEVL = State.Builder.CreateMul( + EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl", + /* NUW= */ true, /* NSW= */ true); + LLVMContext &Ctx = State.Builder.getContext(); + + Value *GroupMask = nullptr; + if (VPValue *BlockInMask = getMask()) { + SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask)); + GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask"); + } else { + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + } + + // Vectorize the interleaved load group. + if (isa<LoadInst>(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic( + VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr, + "wide.vp.load"); + NewLoad->addParamAttr(0, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewLoad); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewLoad); + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(getVPValue(J), StridedVec); + ++J; + } + return; + } // End for interleaved load. + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + // Vectorize the interleaved store group. + ArrayRef<VPValue *> StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector<Value *, 4> StoredVecs; + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) { + StoredVecs.push_back(PoisonValue::get(SubVT)); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + ++StoredIdx; + } + + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = + State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, InterleaveEVL}); + NewStore->addParamAttr(1, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewStore); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewStore); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. unsigned InsertPosIdx = 0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e0bf241c73fd..2cac5557daee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -39,6 +40,10 @@ using namespace llvm; using namespace VPlanPatternMatch; +cl::opt<bool> EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide get active lane mask instructions")); + bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : Recipe.operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(VPBB, Def)); + WorkList.insert({VPBB, Def}); } } @@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : SinkCandidate->operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(SinkTo, Def)); + WorkList.insert({SinkTo, Def}); Changed = true; } return Changed; @@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *BlockInMask = PredRecipe->getMask(); auto *MaskDef = BlockInMask->getDefiningRecipe(); auto *BOMRecipe = new VPBranchOnMaskRecipe( - BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc()); + BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown()); auto *Entry = Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); @@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser( Type *StepTy = TypeInfo.inferScalarType(Step); auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); return B.createPtrAdd(EndValue, - B.createNaryOp(Instruction::Sub, {Zero, Step}), {}, - "ind.escape"); + B.createNaryOp(Instruction::Sub, {Zero, Step}), + DebugLoc::getUnknown(), "ind.escape"); } if (ScalarTy->isFloatingPointTy()) { const auto &ID = WideIV->getInductionDescriptor(); @@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { if (!ExpR) continue; - auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR}); - if (I.second) + const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR); + if (Inserted) continue; - ExpR->replaceAllUsesWith(I.first->second); + ExpR->replaceAllUsesWith(V->second); ExpR->eraseFromParent(); } } @@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X // && (Y || Z) and (X || !X) into true. This requires queuing newly created // recipes to be visited during simplification. - VPValue *X, *Y; + VPValue *X, *Y, *Z; if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) { @@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - // OR x, 1 -> 1. - if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) { - Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1) - : Def->getOperand(0)); - Def->eraseFromParent(); - return; - } + // x | 1 -> 1 + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x | 0 -> x + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(X); + + // x & 0 -> 0 + if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x && false -> false + if (match(Def, m_LogicalAnd(m_VPValue(X), m_False()))) + return Def->replaceAllUsesWith(Def->getOperand(1)); + + // (x && y) || (x && z) -> x && (y || z) + VPBuilder Builder(Def); + if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) && + // Simplify only if one of the operands has one use to avoid creating an + // extra recipe. + (!Def->getOperand(0)->hasMoreThanOneUniqueUser() || + !Def->getOperand(1)->hasMoreThanOneUniqueUser())) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); + + // x && !x -> 0 + if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + return Def->replaceAllUsesWith(Plan->getOrAddLiveIn( + ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def)))); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) return Def->replaceAllUsesWith(X); @@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With + // tail folding it is likely that x is a header mask and can be simplified + // further. + if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_VPValue(Z))) && + X->hasMoreThanOneUniqueUser()) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z))); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); @@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { m_VPValue(X), m_SpecificInt(1)))) { Type *WideStepTy = TypeInfo.inferScalarType(Def); if (TypeInfo.inferScalarType(X) != WideStepTy) - X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy); + X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy); Def->replaceAllUsesWith(X); return; } @@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - VPInstruction *OpVPI; - if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) && - OpVPI->isVectorToScalar()) { - Def->replaceAllUsesWith(OpVPI); - return; + if (match(Def, + m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) && + vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) { + return U->usesScalars(A) || Def == U; + })) { + return Def->replaceAllUsesWith(A); } } @@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R); + if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + auto *Clone = new VPReplicateRecipe( + RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), + true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); + Clone->insertBefore(RepOrWidenR); + auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement, + {Clone->getOperand(0)}); + Ext->insertBefore(Clone); + Clone->setOperand(0, Ext); + RepR->eraseFromParent(); + continue; + } + // Skip recipes that aren't single scalars or don't have only their // scalar results used. In the latter case, we would introduce extra // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - !vputils::onlyScalarValuesUsed(RepOrWidenR)) + !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { + return U->usesScalars(RepOrWidenR) || + match(cast<VPRecipeBase>(U), + m_ExtractLastElement(m_VPValue())); + })) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), @@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { } } +/// Try to see if all of \p Blend's masks share a common value logically and'ed +/// and remove it from the masks. +static void removeCommonBlendMask(VPBlendRecipe *Blend) { + if (Blend->isNormalized()) + return; + VPValue *CommonEdgeMask; + if (!match(Blend->getMask(0), + m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + if (!match(Blend->getMask(I), + m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1)); +} + /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes /// to make sure the masks are simplified. static void simplifyBlends(VPlan &Plan) { @@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) { if (!Blend) continue; + removeCommonBlendMask(Blend); + // Try to remove redundant blend recipes. SmallPtrSet<VPValue *, 4> UniqueValues; if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) @@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C); } +/// Try to replace multiple active lane masks used for control flow with +/// a single, wide active lane mask instruction followed by multiple +/// extract subvector intrinsics. This applies to the active lane mask +/// instructions both in the loop and in the preheader. +/// Incoming values of all ActiveLaneMaskPHIs are updated to use the +/// new extracts from the first active lane mask, which has it's last +/// operand (multiplier) set to UF. +static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, + unsigned UF) { + if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1) + return false; + + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); + auto *Term = &ExitingVPBB->back(); + + using namespace llvm::VPlanPatternMatch; + if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) + return false; + + auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); + LLVMContext &Ctx = Plan.getContext(); + + auto ExtractFromALM = [&](VPInstruction *ALM, + SmallVectorImpl<VPValue *> &Extracts) { + DebugLoc DL = ALM->getDebugLoc(); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<VPValue *> Ops; + Ops.append({ALM, Plan.getOrAddLiveIn( + ConstantInt::get(IntegerType::getInt64Ty(Ctx), + VF.getKnownMinValue() * Part))}); + auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), DL); + Extracts[Part] = Ext; + Ext->insertAfter(ALM); + } + }; + + // Create a list of each active lane mask phi, ordered by unroll part. + SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr); + for (VPRecipeBase &R : Header->phis()) { + auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R); + if (!Phi) + continue; + VPValue *Index = nullptr; + match(Phi->getBackedgeValue(), + m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue())); + assert(Index && "Expected index from ActiveLaneMask instruction"); + + auto *II = dyn_cast<VPInstruction>(Index); + if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) { + auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue()); + Phis[Part->getZExtValue()] = Phi; + } else + // Anything other than a CanonicalIVIncrementForPart is part 0 + Phis[0] = Phi; + } + + assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && + "Expected one VPActiveLaneMaskPHIRecipe for each unroll part"); + + auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue()); + auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue()); + + assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask && + LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) && + "Expected incoming values of Phi to be ActiveLaneMasks"); + + // When using wide lane masks, the return type of the get.active.lane.mask + // intrinsic is VF x UF (last operand). + VPValue *ALMMultiplier = + Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + EntryALM->setOperand(2, ALMMultiplier); + LoopALM->setOperand(2, ALMMultiplier); + + // Create UF x extract vectors and insert into preheader. + SmallVector<VPValue *> EntryExtracts(UF); + ExtractFromALM(EntryALM, EntryExtracts); + + // Create UF x extract vectors and insert before the loop compare & branch, + // updating the compare to use the first extract. + SmallVector<VPValue *> LoopExtracts(UF); + ExtractFromALM(LoopALM, LoopExtracts); + VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0)); + Not->setOperand(0, LoopExtracts[0]); + + // Update the incoming values of active lane mask phis. + for (unsigned Part = 0; Part < UF; ++Part) { + Phis[Part]->setStartValue(EntryExtracts[Part]); + Phis[Part]->setBackedgeValue(LoopExtracts[Part]); + } + + return true; +} + /// Try to simplify the branch condition of \p Plan. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, @@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || - match(Term, m_BranchOnCond( - m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { + match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch // terminator is BranchOnCount or BranchOnCond where the input is // Not(ActiveLaneMask). @@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - bool MadeChange = - simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); + bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); + MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); if (MadeChange) { @@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } +namespace { +struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { + static bool isSentinel(const VPSingleDefRecipe *Def) { + return Def == getEmptyKey() || Def == getTombstoneKey(); + } + + /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. + /// Returns an optional pair, where the first element indicates whether it is + /// an intrinsic ID. + static std::optional<std::pair<bool, unsigned>> + getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { + return TypeSwitch<const VPSingleDefRecipe *, + std::optional<std::pair<bool, unsigned>>>(R) + .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, + VPWidenSelectRecipe, VPReplicateRecipe>( + [](auto *I) { return std::make_pair(false, I->getOpcode()); }) + .Case<VPWidenIntrinsicRecipe>([](auto *I) { + return std::make_pair(true, I->getVectorIntrinsicID()); + }) + .Default([](auto *) { return std::nullopt; }); + } + + /// Returns true if recipe \p Def can be safely handed for CSE. + static bool canHandle(const VPSingleDefRecipe *Def) { + // We can extend the list of handled recipes in the future, + // provided we account for the data embedded in them while checking for + // equality or hashing. + auto C = getOpcodeOrIntrinsicID(Def); + + // The issue with (Insert|Extract)Value is that the index of the + // insert/extract is not a proper operand in LLVM IR, and hence also not in + // VPlan. + if (!C || (!C->first && (C->second == Instruction::InsertValue || + C->second == Instruction::ExtractValue))) + return false; + + // During CSE, we can only handle recipes that don't read from memory: if + // they read from memory, there could be an intervening write to memory + // before the next instance is CSE'd, leading to an incorrect result. + return !Def->mayReadFromMemory(); + } + + /// Hash the underlying data of \p Def. + static unsigned getHashValue(const VPSingleDefRecipe *Def) { + const VPlan *Plan = Def->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + hash_code Result = hash_combine( + Def->getVPDefID(), getOpcodeOrIntrinsicID(Def), + TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def), + hash_combine_range(Def->operands())); + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def)) + if (RFlags->hasPredicate()) + return hash_combine(Result, RFlags->getPredicate()); + return Result; + } + + /// Check equality of underlying data of \p L and \p R. + static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) { + if (isSentinel(L) || isSentinel(R)) + return L == R; + if (L->getVPDefID() != R->getVPDefID() || + getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) || + vputils::isSingleScalar(L) != vputils::isSingleScalar(R) || + !equal(L->operands(), R->operands())) + return false; + if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L)) + if (LFlags->hasPredicate() && + LFlags->getPredicate() != + cast<VPRecipeWithIRFlags>(R)->getPredicate()) + return false; + const VPlan *Plan = L->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R); + } +}; +} // end anonymous namespace + +/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p +/// Plan. +void VPlanTransforms::cse(VPlan &Plan) { + VPDominatorTree VPDT(Plan); + DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + auto *Def = dyn_cast<VPSingleDefRecipe>(&R); + if (!Def || !VPCSEDenseMapInfo::canHandle(Def)) + continue; + if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) { + // V must dominate Def for a valid replacement. + if (!VPDT.dominates(V->getParent(), VPBB)) + continue; + // Only keep flags present on both V and Def. + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V)) + RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def)); + Def->replaceAllUsesWith(V); + continue; + } + CSEMap[Def] = Def; + } + } +} + /// Move loop-invariant recipes out of the vector loop region in \p Plan. static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantInductionCasts, Plan); runPass(simplifyRecipes, Plan); - runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); - runPass(narrowToSingleScalarRecipes, Plan); + runPass(simplifyBlends, Plan); runPass(legalizeAndOptimizeInductions, Plan); + runPass(narrowToSingleScalarRecipes, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); runPass(simplifyRecipes, Plan); runPass(removeBranchOnConst, Plan); @@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - auto *EntryALM = - Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, - DL, "active.lane.mask.entry"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {EntryIncrement, TC, ALMMultiplier}, DL, + "active.lane.mask.entry"); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. - auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + auto *LaneMaskPhi = + new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown()); LaneMaskPhi->insertAfter(CanonicalIVPHI); // Create the active lane mask for the next iteration of the loop before the @@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); + {InLoopIncrement, TripCount, ALMMultiplier}, + DL, "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); // Replace the original terminator with BranchOnCond. We have to invert the @@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( /// for the header-mask pattern manually. static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { SmallVector<VPValue *> WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + IsaPred<VPWidenCanonicalIVRecipe>); assert(count_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= - 1 && + IsaPred<VPWidenCanonicalIVRecipe>) <= 1 && "Must have at most one VPWideCanonicalIVRecipe"); if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { auto *WideCanonicalIV = @@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + IsaPred<VPWidenCanonicalIVRecipe>); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); @@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + LaneMask = + B.createNaryOp(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, + nullptr, "active.lane.mask"); } // Walk users of WideCanonicalIV and replace the header mask of the form @@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewAddr = GetNewAddr(S->getAddr()); return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); }) + .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { + VPValue *NewMask = GetNewMask(IR->getMask()); + return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); + }) .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); @@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPBuilder Builder(LoopRegion->getPreheaderVPBB()); MaxEVL = Builder.createScalarZExtOrTrunc( MaxEVL, Type::getInt32Ty(Plan.getContext()), - TypeInfo.inferScalarType(MaxEVL), DebugLoc()); + TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); - VPValue *PrevEVL = - Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl"); + VPValue *PrevEVL = Builder.createScalarPhi( + {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { @@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); - assert(NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>( + EVLRecipe)) { + for (unsigned I = 0; I < NumDefVal; ++I) { + VPValue *CurVPV = CurRecipe->getVPValue(I); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); + } } ToErase.push_back(CurRecipe); } @@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. - auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); + auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown()); EVLPhi->insertAfter(CanonicalIVPHI); VPBuilder Builder(Header, Header->getFirstNonPhi()); // Create the AVL (application vector length), starting from TC -> 0 in steps @@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *AVLSafe = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); - AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); + AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), + "safe_avl"); } auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, - DebugLoc()); + DebugLoc::getUnknown()); auto *CanonicalIVIncrement = cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); @@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); + VPValue *AVL; + [[maybe_unused]] bool FoundAVL = + match(EVLIncrement, + m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))); + assert(FoundAVL && "Didn't find AVL?"); + + // The AVL may be capped to a safe distance. + VPValue *SafeAVL; + if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue()))) + AVL = SafeAVL; + + VPValue *AVLNext; + [[maybe_unused]] bool FoundAVLNext = + match(AVL, m_VPInstruction<Instruction::PHI>( + m_Specific(Plan.getTripCount()), m_VPValue(AVLNext))); + assert(FoundAVLNext && "Didn't find AVL backedge?"); // Convert EVLPhi to concrete recipe. auto *ScalarR = @@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { // Replace the use of VectorTripCount in the latch-exiting block. // Before: (branch-on-count EVLIVInc, VectorTripCount) - // After: (branch-on-count EVLIVInc, TripCount) + // After: (branch-on-cond eq AVLNext, 0) VPBasicBlock *LatchExiting = HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock(); @@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { m_BranchOnCount(m_VPValue(EVLIncrement), m_Specific(&Plan.getVectorTripCount()))) && "Unexpected terminator in EVL loop"); - LatchExitingBr->setOperand(1, Plan.getTripCount()); + + Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); + VPBuilder Builder(LatchExitingBr); + VPValue *Cmp = + Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); + LatchExitingBr->eraseFromParent(); +} + +void VPlanTransforms::replaceSymbolicStrides( + VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap) { + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { + auto *R = cast<VPRecipeBase>(&U); + return R->getParent()->getParent() || + R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); + }; + for (const SCEV *Stride : StridesMap.values()) { + using namespace SCEVPatternMatch; + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + const APInt *StrideConst; + if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst))) + // Only handle constant strides for now. + continue; + + auto *CI = + Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + + // The versioned value may not be used in the loop directly but through a + // sext/zext. Add new live-ins in those cases. + for (Value *U : StrideV->users()) { + if (!isa<SExtInst, ZExtInst>(U)) + continue; + VPValue *StrideVPV = Plan.getLiveIn(U); + if (!StrideVPV) + continue; + unsigned BW = U->getType()->getScalarSizeInBits(); + APInt C = + isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW); + VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + } + } } void VPlanTransforms::dropPoisonGeneratingRecipes( @@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step); Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags); - Init = - Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction"); + Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, + DebugLoc::getUnknown(), "induction"); // Create the widened phi of the vector IV. auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, @@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { R->eraseFromParent(); } -void VPlanTransforms::handleUncountableEarlyExit( - VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, - VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) { +void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, + VPBasicBlock *EarlyExitVPBB, + VPlan &Plan, + VPBasicBlock *HeaderVPBB, + VPBasicBlock *LatchVPBB) { VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; if (!EarlyExitVPBB->getSinglePredecessor() && EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { @@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit( } VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx); - auto IsVector = [](ElementCount VF) { return VF.isVector(); }; - // When the VFs are vectors, need to add `extract` to get the incoming value - // from early exit. When the range contains scalar VF, limit the range to - // scalar VF to prevent mis-compilation for the range containing both scalar - // and vector VFs. - if (!IncomingFromEarlyExit->isLiveIn() && - LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) { + if (!IncomingFromEarlyExit->isLiveIn()) { // Update the incoming value from the early exit. VPValue *FirstActiveLane = EarlyExitB.createNaryOp( VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr, @@ -3125,7 +3469,7 @@ static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - if (Opcode != Instruction::Add) + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; Type *RedTy = Ctx.Types.inferScalarType(Red); @@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = - Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( + isZExt, Opcode, RedTy, SrcVecTy, CostKind); InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { Plan.resetTripCount(Exp); ExpSCEV->eraseFromParent(); } + assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) && + "VPExpandSCEVRecipes must be at the beginning of the entry block, " + "after any VPIRInstructions"); + // Add IR instructions in the entry basic block but not in the VPIRBasicBlock + // to the VPIRBasicBlock. + auto EI = Entry->begin(); + for (Instruction &I : drop_end(*EntryBB)) { + if (EI != Entry->end() && isa<VPIRInstruction>(*EI) && + &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) { + EI++; + continue; + } + VPIRInstruction::create(I)->insertBefore(*Entry, EI); + } + return ExpandedSCEVs; } @@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) { void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); - if (VF.isScalable() || !VectorLoop) + if (!VectorLoop) return; VPTypeAnalysis TypeInfo(Plan); - unsigned FixedVF = VF.getFixedValue(); + unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { if (isa<VPCanonicalIVPHIRecipe>(&R) || @@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, continue; // Bail out on non-consecutive interleave groups. - if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo, + if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, VectorRegWidth)) return; @@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, return; // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. - auto NarrowOp = [](VPValue *V) -> VPValue * { + SmallPtrSet<VPValue *, 4> NarrowedOps; + auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { auto *R = V->getDefiningRecipe(); - if (!R) + if (!R || NarrowedOps.contains(V)) return V; if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { // Narrow interleave group to wide load, as transformed VPlan will only @@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); + NarrowedOps.insert(L); return L; } @@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, assert(RepR->isSingleScalar() && isa<LoadInst>(RepR->getUnderlyingInstr()) && "must be a single scalar load"); + NarrowedOps.insert(RepR); return RepR; } auto *WideLoad = cast<VPWidenLoadRecipe>(R); @@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, /*IsUniform*/ true, /*Mask*/ nullptr, *WideLoad); N->insertBefore(WideLoad); + NarrowedOps.insert(N); return N; }; @@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // original iteration. auto *CanIV = Plan.getCanonicalIV(); auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); - Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get( - CanIV->getScalarType(), 1 * Plan.getUF()))); - Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + VPBuilder PHBuilder(Plan.getVectorPreheader()); + + VPValue *UF = Plan.getOrAddLiveIn( + ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + if (VF.isScalable()) { + VPValue *VScale = PHBuilder.createElementCount( + CanIV->getScalarType(), ElementCount::getScalable(1)); + VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); + Inc->setOperand(1, VScaleUF); + Plan.getVF().replaceAllUsesWith(VScale); + } else { + Inc->setOperand(1, UF); + Plan.getVF().replaceAllUsesWith( + Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 700b94621d5f..1957428fab79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -62,16 +62,47 @@ struct VPlanTransforms { /// The created loop is wrapped in an initial skeleton to facilitate /// vectorization, consisting of a vector pre-header, an exit block for the /// main vector loop (middle.block) and a new block as preheader of the scalar - /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p - /// InductionTy and \p IVDL, and creates a VPValue expression for the original - /// trip count. + /// loop (scalar.ph). See below for an illustration. It also adds a canonical + /// IV and its increment, using \p InductionTy and \p IVDL, and creates a + /// VPValue expression for the original trip count. + /// + /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's + /// / \ old preheader. Will contain iteration number check and SCEV + /// | | expansions. + /// | | + /// / v + /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be + /// | / | added later. + /// | / v + /// || [ ] <-- vector pre header. + /// |/ | + /// | v + /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized. + /// | [ ]_| + /// | | + /// | v + /// | [ ] <--- middle-block with the branch to successors + /// | / | + /// | / | + /// | | v + /// \--->[ ] <--- scalar preheader (initial a VPBasicBlock, which will be + /// | | replaced later by a VPIRBasicBlock wrapping the scalar + /// | | preheader basic block. + /// | | + /// v <-- edge from middle to exit iff epilogue is not required. + /// | [ ] \ + /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, + /// | | header wrapped in VPIRBasicBlock). + /// \ | + /// \ v + /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE); /// Update \p Plan to account for all early exits. - LLVM_ABI_FOR_TEST static void - handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, + bool HasUncountableExit); /// If a check is needed to guard executing the scalar epilogue loop, it will /// be added to the middle block. @@ -79,6 +110,13 @@ struct VPlanTransforms { bool RequiresScalarEpilogueCheck, bool TailFolded); + // Create a check to \p Plan to see if the vector loop should be executed. + static void addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE); + /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG. LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan); @@ -161,6 +199,12 @@ struct VPlanTransforms { truncateToMinimalBitwidths(VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs); + /// Replace symbolic strides from \p StridesMap in \p Plan with constants when + /// possible. + static void + replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap); + /// Drop poison flags from recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those /// recipes meet the following conditions: @@ -207,8 +251,7 @@ struct VPlanTransforms { static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, - VPBasicBlock *LatchVPBB, - VFRange &Range); + VPBasicBlock *LatchVPBB); /// Replace loop regions with explicit CFG. static void dissolveLoopRegions(VPlan &Plan); @@ -220,9 +263,10 @@ struct VPlanTransforms { /// variable vector lengths instead of fixed lengths. This transformation: /// * Makes EVL-Phi concrete. // * Removes CanonicalIV and increment. - /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc, - /// VectorTripCount) with variable-length stepping (branch-on-cond - /// EVLIVInc, TripCount). + /// * Replaces the exit condition from + /// (branch-on-count CanonicalIVInc, VectorTripCount) + /// to + /// (branch-on-cond eq AVLNext, 0) static void canonicalizeEVLLoops(VPlan &Plan); /// Lower abstract recipes to concrete ones, that can be codegen'd. @@ -242,6 +286,9 @@ struct VPlanTransforms { /// removing dead edges to their successors. static void removeBranchOnConst(VPlan &Plan); + /// Perform common-subexpression-elimination on \p Plan. + static void cse(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 4bcde8cd5d42..443df167378b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -92,18 +92,18 @@ public: void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) { for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { - auto Ins = VPV2Parts.insert({VPV, {}}); - assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); - Ins.first->second.push_back(CopyR->getVPValue(Idx)); + const auto &[V, _] = VPV2Parts.try_emplace(VPV); + assert(V->second.size() == Part - 1 && "earlier parts not set"); + V->second.push_back(CopyR->getVPValue(Idx)); } } /// Given a uniform recipe \p R, add it for all parts. void addUniformForAllParts(VPSingleDefRecipe *R) { - auto Ins = VPV2Parts.insert({R, {}}); - assert(Ins.second && "uniform value already added"); + const auto &[V, Inserted] = VPV2Parts.try_emplace(R); + assert(Inserted && "uniform value already added"); for (unsigned Part = 0; Part != UF; ++Part) - Ins.first->second.push_back(R); + V->second.push_back(R); } bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } @@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { VPBuilder Builder(RepR); if (RepR->getNumUsers() == 0) { - if (isa<StoreInst>(RepR->getUnderlyingInstr()) && - vputils::isSingleScalar(RepR->getOperand(1))) { - // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), - Def2LaneDefs); - } else { - // Create single-scalar version of RepR for all lanes. - for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); - } + // Create single-scalar version of RepR for all lanes. + for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); RepR->eraseFromParent(); continue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 700a733bf9f2..c6c1ef336982 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9e1d325a4d8d..77c099b27171 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) { case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::FCmp: + case Instruction::Select: + case VPInstruction::Not: case VPInstruction::Broadcast: case VPInstruction::PtrAdd: return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef..85c6c2c8d796 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,7 +38,7 @@ struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPInterleaveRecipe; +class VPInterleaveBase; class VPPhiAccessors; // This is the base class of the VPlan Def/Use graph, used for modeling the data @@ -48,7 +48,7 @@ class VPPhiAccessors; class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; - friend class VPInterleaveRecipe; + friend class VPInterleaveBase; friend class VPlan; friend class VPExpressionRecipe; @@ -335,6 +335,7 @@ public: VPExpressionSC, VPIRInstructionSC, VPInstructionSC, + VPInterleaveEVLSC, VPInterleaveSC, VPReductionEVLSC, VPReductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index e25ffe135418..99f3bc367a54 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>( + .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe, + VPInterleaveEVLRecipe>( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case<VPInstructionWithType>( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) @@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) { const VPBlockBase *Exiting = Region->getExiting(); // Entry and Exiting shouldn't have any predecessor/successor, respectively. - if (Entry->getNumPredecessors() != 0) { + if (Entry->hasPredecessors()) { errs() << "region entry block has predecessors\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 092a3a87954f..17cb18a22336 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -99,6 +99,10 @@ private: InstructionWorklist Worklist; + /// Next instruction to iterate. It will be updated when it is erased by + /// RecursivelyDeleteTriviallyDeadInstructions. + Instruction *NextInst; + // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" // parameter. That should be updated to specific sub-classes because the // run loop was changed to dispatch on opcode. @@ -118,6 +122,7 @@ private: bool foldInsExtBinop(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); bool foldBitOpOfCastops(Instruction &I); + bool foldBitOpOfCastConstant(Instruction &I); bool foldBitcastShuffle(Instruction &I); bool scalarizeOpOrCmp(Instruction &I); bool scalarizeVPIntrinsic(Instruction &I); @@ -169,13 +174,16 @@ private: // further folds that were hindered by OneUse limits. SmallPtrSet<Value *, 4> Visited; for (Value *Op : Ops) { - if (Visited.insert(Op).second) { + if (!Visited.contains(Op)) { if (auto *OpI = dyn_cast<Instruction>(Op)) { if (RecursivelyDeleteTriviallyDeadInstructions( - OpI, nullptr, nullptr, [this](Value *V) { - if (auto I = dyn_cast<Instruction>(V)) { + OpI, nullptr, nullptr, [&](Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) { LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); Worklist.remove(I); + if (I == NextInst) + NextInst = NextInst->getNextNode(); + Visited.insert(I); } })) continue; @@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { if (LHSSrc->getType() != RHSSrc->getType()) return false; - // Only handle vector types with integer elements - auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType()); - auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType()); - if (!SrcVecTy || !DstVecTy) + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) return false; - if (!SrcVecTy->getScalarType()->isIntegerTy() || - !DstVecTy->getScalarType()->isIntegerTy()) + // Only integer scalar/vector values are legal for bitwise logic operations. + if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) return false; // Cost Check : @@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { // NewCost = bitlogic + cast // Calculate specific costs for each cast with instruction context - InstructionCost LHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, LHSCast); - InstructionCost RHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, RHSCast); + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + InstructionCost RHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast); InstructionCost OldCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) + LHSCastCost + RHSCastCost; // For new cost, we can't provide an instruction (it doesn't exist yet) InstructionCost GenericCastCost = TTI.getCastInstrCost( - CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind); + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); InstructionCost NewCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) + GenericCastCost; // Account for multi-use casts using specific costs @@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { return true; } +/// Match: +// bitop(castop(x), C) -> +// bitop(castop(x), castop(InvC)) -> +// castop(bitop(x, InvC)) +// Supports: bitcast +bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) { + Instruction *LHS; + Constant *C; + + // Check if this is a bitwise logic operation + if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C)))) + return false; + + // Get the cast instructions + auto *LHSCast = dyn_cast<CastInst>(LHS); + if (!LHSCast) + return false; + + Instruction::CastOps CastOpcode = LHSCast->getOpcode(); + + // Only handle supported cast operations + switch (CastOpcode) { + case Instruction::BitCast: + break; + default: + return false; + } + + Value *LHSSrc = LHSCast->getOperand(0); + + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) + return false; + + // Only integer scalar/vector values are legal for bitwise logic operations. + if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) + return false; + + // Find the constant InvC, such that castop(InvC) equals to C. + PreservedCastFlags RHSFlags; + Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags); + if (!InvC) + return false; + + // Cost Check : + // OldCost = bitlogic + cast + // NewCost = bitlogic + cast + + // Calculate specific costs for each cast with instruction context + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost; + + // For new cost, we can't provide an instruction (it doesn't exist yet) + InstructionCost GenericCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = + TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) + + GenericCastCost; + + // Account for multi-use casts using specific costs + if (!LHSCast->hasOneUse()) + NewCost += LHSCastCost; + + LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost + << " NewCost=" << NewCost << "\n"); + + if (NewCost > OldCost) + return false; + + // Create the operation on the source type + Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), + LHSSrc, InvC, I.getName() + ".inner"); + if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp)) + NewBinOp->copyIRFlags(&I); + + Worklist.pushValue(NewOp); + + // Create the cast operation directly to ensure we get a new instruction + Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + + // Insert the new instruction + Value *Result = Builder.Insert(NewCast); + + replaceValue(I, *Result); + return true; +} + /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. @@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::CastContextHint::None, CostKind, RedOp); CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost; - CostAfterReduction = - TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind); + CostAfterReduction = TTI.getMulAccReductionCost( + IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind); return; } CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy, @@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { unsigned MaxVectorSize = TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); unsigned MaxElementsInVector = MaxVectorSize / ElementSize; + if (MaxElementsInVector == 0) + return false; // When there are multiple shufflevector operations on the same input, // especially when the vector length is larger than the register size, // identical shuffle patterns may occur across different groups of elements. @@ -4467,6 +4574,8 @@ bool VectorCombine::run() { case Instruction::Xor: if (foldBitOpOfCastops(I)) return true; + if (foldBitOpOfCastConstant(I)) + return true; break; case Instruction::PHI: if (shrinkPhiOfShuffles(I)) @@ -4519,13 +4628,21 @@ bool VectorCombine::run() { if (!DT.isReachableFromEntry(&BB)) continue; // Use early increment range so that we can erase instructions in loop. - for (Instruction &I : make_early_inc_range(BB)) { - if (I.isDebugOrPseudoInst()) - continue; - MadeChange |= FoldInst(I); + // make_early_inc_range is not applicable here, as the next iterator may + // be invalidated by RecursivelyDeleteTriviallyDeadInstructions. + // We manually maintain the next instruction and update it when it is about + // to be deleted. + Instruction *I = &BB.front(); + while (I) { + NextInst = I->getNextNode(); + if (!I->isDebugOrPseudoInst()) + MadeChange |= FoldInst(*I); + I = NextInst; } } + NextInst = nullptr; + while (!Worklist.isEmpty()) { Instruction *I = Worklist.removeOne(); if (!I) |
