summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize
diff options
context:
space:
mode:
authorMingming Liu <mingmingl@google.com>2025-09-10 15:25:31 -0700
committerGitHub <noreply@github.com>2025-09-10 15:25:31 -0700
commit1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Transforms/Vectorize
parent898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parentb8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp108
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp172
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h24
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp931
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp338
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp146
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h250
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp93
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h47
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp410
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp542
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h67
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp25
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp5
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp165
20 files changed, 2304 insertions, 1033 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae..53129e2e5fbb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -170,10 +170,10 @@ private:
bool recognizeFindFirstByte();
Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
- unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
- BasicBlock *ExitFail, Value *SearchStart,
- Value *SearchEnd, Value *NeedleStart,
- Value *NeedleEnd);
+ unsigned VF, Type *CharTy, Value *IndPhi,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd);
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
@@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) {
return false;
}
+static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes,
+ BasicBlock *SuccBB, BasicBlock *IncBB) {
+ for (PHINode &PN : SuccBB->phis()) {
+ // Look through the incoming values to find ScalarRes, meaning this is a
+ // PHI collecting the results of the transformation.
+ bool ResPhi = false;
+ for (Value *Op : PN.incoming_values())
+ if (Op == ScalarRes) {
+ ResPhi = true;
+ break;
+ }
+
+ // Any PHI that depended upon the result of the transformation needs a new
+ // incoming value from IncBB.
+ if (ResPhi)
+ PN.addIncoming(VectorRes, IncBB);
+ else {
+ // There should be no other outside uses of other values in the
+ // original loop. Any incoming values should either:
+ // 1. Be for blocks outside the loop, which aren't interesting. Or ..
+ // 2. These are from blocks in the loop with values defined outside
+ // the loop. We should a similar incoming value from CmpBB.
+ for (BasicBlock *BB : PN.blocks())
+ if (L->contains(BB)) {
+ PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB);
+ break;
+ }
+ }
+ }
+}
+
bool LoopIdiomVectorize::recognizeByteCompare() {
// Currently the transformation only works on scalable vector types, although
// there is no fundamental reason why it cannot be made to work for fixed
@@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
{VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
- StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
- auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
- Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
- Value *VectorMatchCmp = Builder.CreateIntrinsic(
- Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
- {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
- "mismatch.cmp");
+ Value *VectorMatchCmp =
+ Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp");
Value *CTZ = Builder.CreateIntrinsic(
Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
{VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
@@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
}
- auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
- for (PHINode &PN : SuccBB->phis()) {
- // At this point we've already replaced all uses of the result from the
- // loop with ByteCmp. Look through the incoming values to find ByteCmp,
- // meaning this is a Phi collecting the results of the byte compare.
- bool ResPhi = false;
- for (Value *Op : PN.incoming_values())
- if (Op == ByteCmpRes) {
- ResPhi = true;
- break;
- }
-
- // Any PHI that depended upon the result of the byte compare needs a new
- // incoming value from CmpBB. This is because the original loop will get
- // deleted.
- if (ResPhi)
- PN.addIncoming(ByteCmpRes, CmpBB);
- else {
- // There should be no other outside uses of other values in the
- // original loop. Any incoming values should either:
- // 1. Be for blocks outside the loop, which aren't interesting. Or ..
- // 2. These are from blocks in the loop with values defined outside
- // the loop. We should a similar incoming value from CmpBB.
- for (BasicBlock *BB : PN.blocks())
- if (CurLoop->contains(BB)) {
- PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
- break;
- }
- }
- }
- };
-
// Ensure all Phis in the successors of CmpBB have an incoming value from it.
- fixSuccessorPhis(EndBB);
+ fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB);
if (EndBB != FoundBB)
- fixSuccessorPhis(FoundBB);
+ fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB);
// The new CmpBB block isn't part of the loop, but will need to be added to
// the outer loop if there is one.
@@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
Value *LoopIdiomVectorize::expandFindFirstByte(
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
- BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
- Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
+ Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *SearchStart, Value *SearchEnd, Value *NeedleStart,
+ Value *NeedleEnd) {
// Set up some types and constants that we intend to reuse.
auto *PtrTy = Builder.getPtrTy();
auto *I64Ty = Builder.getInt64Ty();
@@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
MatchLCSSA->addIncoming(Search, BB2);
MatchPredLCSSA->addIncoming(MatchPred, BB2);
+ // Ensure all Phis in the successors of BB3/BB5 have an incoming value from
+ // them.
+ fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3);
+ if (ExitSucc != ExitFail)
+ fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5);
+
if (VerifyLoops) {
OuterLoop->verifyLoop();
InnerLoop->verifyLoop();
@@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte(
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
- Value *MatchVal =
- expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
- SearchStart, SearchEnd, NeedleStart, NeedleEnd);
+ expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail,
+ SearchStart, SearchEnd, NeedleStart, NeedleEnd);
assert(PHBranch->isUnconditional() &&
"Expected preheader to terminate with an unconditional branch.");
- // Add new incoming values with the result of the transformation to PHINodes
- // of ExitSucc that use IndPhi.
- for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
- auto *PN = dyn_cast<PHINode>(U);
- if (PN && PN->getParent() == ExitSucc)
- PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
- }
-
if (VerifyLoops && CurLoop->getParentLoop()) {
CurLoop->getParentLoop()->verifyLoop();
if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 789047a2a28e..2704e66f3a70 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,8 +15,10 @@
//
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
});
}
- if (!LAI->canVectorizeMemory())
+ if (!LAI->canVectorizeMemory()) {
+ if (hasUncountableExitWithSideEffects()) {
+ reportVectorizationFailure(
+ "Cannot vectorize unsafe dependencies in uncountable exit loop with "
+ "side effects",
+ "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
+ TheLoop);
+ return false;
+ }
+
return canVectorizeIndirectUnsafeDependences();
+ }
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!isGuaranteedNotToBePoison(CurrV, AC,
TheLoop->getLoopPredecessor()
->getTerminator()
- ->getIterator()))
+ ->getIterator(),
+ DT))
return false;
continue;
}
@@ -1754,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
};
+ bool HasSideEffects = false;
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
if (I.mayWriteToMemory()) {
- // We don't support writes to memory.
+ if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
+ HasSideEffects = true;
+ continue;
+ }
+
+ // We don't support complex writes to memory.
reportVectorizationFailure(
- "Writes to memory unsupported in early exit loops",
- "Cannot vectorize early exit loop with writes to memory",
+ "Complex writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with complex writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
- } else if (!IsSafeOperation(&I)) {
+ }
+
+ if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
@@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
"Expected latch predecessor to be the early exiting block");
+ SmallVector<LoadInst *, 4> NonDerefLoads;
// TODO: Handle loops that may fault.
- Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ if (!HasSideEffects) {
+ // Read-only loop.
+ Predicates.clear();
+ if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
+ &Predicates)) {
+ reportVectorizationFailure(
+ "Loop may fault", "Cannot vectorize non-read-only early exit loop",
+ "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ } else if (!canUncountableExitConditionLoadBeMoved(
+ SingleUncountableExitingBlock))
return false;
+
+ // Check non-dereferenceable loads if any.
+ for (LoadInst *LI : NonDerefLoads) {
+ // Only support unit-stride access for now.
+ int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
+ if (Stride != 1) {
+ reportVectorizationFailure(
+ "Loop contains potentially faulting strided load",
+ "Cannot vectorize early exit loop with "
+ "strided fault-only-first load",
+ "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
+ return false;
+ }
+ PotentiallyFaultingLoads.insert(LI);
+ LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
+ << "\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
"backedge taken count: "
<< *SymbolicMaxBTC << '\n');
UncountableExitingBB = SingleUncountableExitingBlock;
+ UncountableExitWithSideEffects = HasSideEffects;
+ return true;
+}
+
+bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
+ BasicBlock *ExitingBlock) {
+ // Try to find a load in the critical path for the uncountable exit condition.
+ // This is currently matching about the simplest form we can, expecting
+ // only one in-loop load, the result of which is directly compared against
+ // a loop-invariant value.
+ // FIXME: We're insisting on a single use for now, because otherwise we will
+ // need to make PHI nodes for other users. That can be done once the initial
+ // transform code lands.
+ auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
+
+ using namespace llvm::PatternMatch;
+ Instruction *L = nullptr;
+ Value *Ptr = nullptr;
+ Value *R = nullptr;
+ if (!match(Br->getCondition(),
+ m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
+ m_Value(R))))) {
+ reportVectorizationFailure(
+ "Early exit loop with store but no supported condition load",
+ "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ // FIXME: Don't rely on operand ordering for the comparison.
+ if (!TheLoop->isLoopInvariant(R)) {
+ reportVectorizationFailure(
+ "Early exit loop with store but no supported condition load",
+ "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ // Make sure that the load address is not loop invariant; we want an
+ // address calculation that we can rotate to the next vector iteration.
+ const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
+ if (!isa<SCEVAddRecExpr>(PtrScev)) {
+ reportVectorizationFailure(
+ "Uncountable exit condition depends on load with an address that is "
+ "not an add recurrence",
+ "EarlyExitLoadInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+
+ // FIXME: Support gathers after first-faulting load support lands.
+ SmallVector<const SCEVPredicate *, 4> Predicates;
+ LoadInst *Load = cast<LoadInst>(L);
+ if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
+ &Predicates)) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(TheLoop);
+ // We need to know that load will be executed before we can hoist a
+ // copy out to run just before the first iteration.
+ // FIXME: Currently, other restrictions prevent us from reaching this point
+ // with a loop where the uncountable exit condition is determined
+ // by a conditional load.
+ assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
+ "Unhandled control flow in uncountable exit loop with side effects");
+
+ // Prohibit any potential aliasing with any instruction in the loop which
+ // might store to memory.
+ // FIXME: Relax this constraint where possible.
+ for (auto *BB : TheLoop->blocks()) {
+ for (auto &I : *BB) {
+ if (&I == Load)
+ continue;
+
+ if (I.mayWriteToMemory()) {
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
+ if (AR == AliasResult::NoAlias)
+ continue;
+ }
+
+ reportVectorizationFailure(
+ "Cannot determine whether critical uncountable exit load address "
+ "does not alias with a memory write",
+ "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
} else {
if (!isVectorizableEarlyExitLoop()) {
assert(!hasUncountableEarlyExit() &&
+ !hasUncountableExitWithSideEffects() &&
"Must be false without vectorizable early-exit loop");
if (DoExtraAnalysis)
Result = false;
@@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
}
+ // Bail out for state-changing loops with uncountable exits for now.
+ if (UncountableExitWithSideEffects) {
+ reportVectorizationFailure(
+ "Writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with writes to memory",
+ "WritesInEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
if (Result) {
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 838476dcae66..d34d2ae7a0b3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -334,6 +334,10 @@ public:
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
}
+ VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) {
+ return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -559,6 +563,20 @@ public:
/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+ /// Create a check to \p Plan to see if the vector loop should be executed
+ /// based on its trip count.
+ void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const;
+
+ /// Update loop metadata and profile info for both the scalar remainder loop
+ /// and \p VectorLoop, if it exists. Keeps all loop hints from the original
+ /// loop on the vector loop and replaces vectorizer-specific metadata.
+ void updateLoopMetadataAndProfileInfo(Loop *VectorLoop,
+ VPBasicBlock *HeaderVPBB,
+ bool VectorizingEpilogue,
+ unsigned EstimatedVFxUF,
+ bool DisableRuntimeUnroll);
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
@@ -613,13 +631,15 @@ private:
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B, bool HasTail) const;
+ const VectorizationFactor &B, bool HasTail,
+ bool IsEpilogue = false) const;
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- const unsigned MaxTripCount, bool HasTail) const;
+ const unsigned MaxTripCount, bool HasTail,
+ bool IsEpilogue = false) const;
/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754..3cff43a51029 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -165,15 +165,6 @@ using namespace SCEVPatternMatch;
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif
-/// @{
-/// Metadata attribute names
-const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
-const char LLVMLoopVectorizeFollowupVectorized[] =
- "llvm.loop.vectorize.followup_vectorized";
-const char LLVMLoopVectorizeFollowupEpilogue[] =
- "llvm.loop.vectorize.followup_epilogue";
-/// @}
-
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
@@ -500,26 +491,22 @@ public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- ElementCount VecWidth,
- ElementCount MinProfitableTripCount,
- unsigned UnrollFactor, LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &RTChecks, VPlan &Plan)
+ ElementCount VecWidth, unsigned UnrollFactor,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
- VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
- UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
- BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
VectorPHVPBB(cast<VPBasicBlock>(
Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop that will contain vectorized instructions later
- /// on, while the old loop will be used as the scalar remainder. Control flow
- /// is generated around the vectorized (and scalar epilogue) loops consisting
- /// of various checks and bypasses. Return the pre-header block of the new
- /// loop. In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
+ /// Creates a basic block for the scalar preheader. Both
+ /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
+ /// the method to create additional blocks and checks needed for epilogue
+ /// vectorization.
virtual BasicBlock *createVectorizedLoopSkeleton();
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -536,38 +523,18 @@ public:
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
- /// Return the additional bypass block which targets the scalar loop by
- /// skipping the epilogue loop after completing the main loop.
- BasicBlock *getAdditionalBypassBlock() const {
- assert(AdditionalBypassBlock &&
- "Trying to access AdditionalBypassBlock but it has not been set");
- return AdditionalBypassBlock;
- }
-
protected:
friend class LoopVectorizationPlanner;
- // Create a check to see if the vector loop should be executed
- Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitIterationCountCheck(BasicBlock *Bypass);
-
- /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader.
- void createVectorLoopSkeleton(StringRef Prefix);
+ /// Create and return a new IR basic block for the scalar preheader whose name
+ /// is prefixed with \p Prefix.
+ BasicBlock *createScalarPreheader(StringRef Prefix);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart() {}
virtual void printDebugTracesAtEnd() {}
- /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
- /// vector preheader and its predecessor, also connecting the new block to the
- /// scalar preheader.
- void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
-
/// The original loop.
Loop *OrigLoop;
@@ -592,8 +559,6 @@ protected:
/// vector elements.
ElementCount VF;
- ElementCount MinProfitableTripCount;
-
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -603,18 +568,9 @@ protected:
// --- Vectorization state ---
- /// The vector-loop preheader.
- BasicBlock *LoopVectorPreHeader = nullptr;
-
- /// The scalar-loop preheader.
- BasicBlock *LoopScalarPreHeader = nullptr;
-
/// Trip count of the original loop.
Value *TripCount = nullptr;
- /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount = nullptr;
-
/// The profitablity analysis.
LoopVectorizationCostModel *Cost;
@@ -626,11 +582,6 @@ protected:
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
- /// The additional bypass block which conditionally skips over the epilogue
- /// loop after executing the main loop. Needed to resume inductions and
- /// reductions during epilogue vectorization.
- BasicBlock *AdditionalBypassBlock = nullptr;
-
VPlan &Plan;
/// The vector preheader block of \p Plan, used as target for check blocks
@@ -679,20 +630,8 @@ public:
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
ElementCount MinProfitableTripCount, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
- MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
- Checks, Plan),
- EPI(EPI) {}
-
- // Override this function to handle the more complex control flow around the
- // three loops.
- BasicBlock *createVectorizedLoopSkeleton() final {
- return createEpilogueVectorizedLoopSkeleton();
- }
-
- /// The interface for creating a vectorized skeleton using one of two
- /// different strategies, each corresponding to one execution of the vplan
- /// as described above.
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+ UnrollFactor, CM, BFI, PSI, Checks, Plan),
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -701,6 +640,9 @@ public:
/// iteration count of the loop is so small that the main vector loop is
/// completely skipped.
EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+ ElementCount MinProfitableTripCount;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -720,14 +662,24 @@ public:
BFI, PSI, Check, Plan, EPI.MainLoopVF,
EPI.MainLoopVF, EPI.MainLoopUF) {}
/// Implements the interface for creating a vectorized skeleton using the
- /// *main loop* strategy (ie the first pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+ /// *main loop* strategy (i.e., the first pass of VPlan execution).
+ BasicBlock *createVectorizedLoopSkeleton() final;
protected:
+ /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+ /// vector preheader and its predecessor, also connecting the new block to the
+ /// scalar preheader.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
+ // Create a check to see if the main vector loop should be executed
+ Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
+ unsigned UF) const;
+
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
- BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
+ BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
+ bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
@@ -736,6 +688,11 @@ protected:
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+ /// The additional bypass block which conditionally skips over the epilogue
+ /// loop after executing the main loop. Needed to resume inductions and
+ /// reductions during epilogue vectorization.
+ BasicBlock *AdditionalBypassBlock = nullptr;
+
public:
EpilogueVectorizerEpilogueLoop(
Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -749,14 +706,22 @@ public:
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
- /// *epilogue loop* strategy (ie the second pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+ /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
+ BasicBlock *createVectorizedLoopSkeleton() final;
+
+ /// Return the additional bypass block which targets the scalar loop by
+ /// skipping the epilogue loop after completing the main loop.
+ BasicBlock *getAdditionalBypassBlock() const {
+ assert(AdditionalBypassBlock &&
+ "Trying to access AdditionalBypassBlock but it has not been set");
+ return AdditionalBypassBlock;
+ }
protected:
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
/// the vector epilogue or the scalar epilogue.
- BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH,
BasicBlock *Bypass,
BasicBlock *Insert);
void printDebugTracesAtStart() override;
@@ -962,8 +927,8 @@ public:
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
- /// \return True if register pressure should be calculated for the given VF.
- bool shouldCalculateRegPressureForVF(ElementCount VF);
+ /// \return True if register pressure should be considered for the given VF.
+ bool shouldConsiderRegPressureForVF(ElementCount VF);
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1159,7 +1124,10 @@ public:
CallWideningDecision getCallWideningDecision(CallInst *CI,
ElementCount VF) const {
assert(!VF.isScalar() && "Expected vector VF");
- return CallWideningDecisions.at({CI, VF});
+ auto I = CallWideningDecisions.find({CI, VF});
+ if (I == CallWideningDecisions.end())
+ return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+ return I->second;
}
/// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1682,7 +1650,9 @@ private:
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
TheLoop->isLoopInvariant(I) ||
- getWideningDecision(I, VF) == CM_Scalarize)
+ getWideningDecision(I, VF) == CM_Scalarize ||
+ (isa<CallInst>(I) &&
+ getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -1878,6 +1848,8 @@ public:
"claimed checks are required");
}
+ SCEVExp.eraseDeadInstructions(SCEVCheckCond);
+
if (!MemCheckBlock && !SCEVCheckBlock)
return;
@@ -2030,7 +2002,7 @@ public:
/// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
/// outside VPlan.
- std::pair<Value *, BasicBlock *> getSCEVChecks() {
+ std::pair<Value *, BasicBlock *> getSCEVChecks() const {
using namespace llvm::PatternMatch;
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
return {nullptr, nullptr};
@@ -2040,7 +2012,7 @@ public:
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
/// outside VPlan.
- std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
using namespace llvm::PatternMatch;
if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
return {nullptr, nullptr};
@@ -2049,9 +2021,7 @@ public:
/// Return true if any runtime checks have been added
bool hasChecks() const {
- using namespace llvm::PatternMatch;
- return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
- MemRuntimeCheckCond;
+ return getSCEVChecks().first || getMemRuntimeChecks().first;
}
};
} // namespace
@@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
+ BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
}
}
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
- unsigned UF) const {
+Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
+ BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
// vector trip count is zero. This check also covers the case where adding one
@@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ BasicBlock *const TCCheckBlock = VectorPH;
IRBuilder<InstSimplifyFolder> Builder(
TCCheckBlock->getContext(),
InstSimplifyFolder(TCCheckBlock->getDataLayout()));
@@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return CheckMinIters;
}
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- Value *CheckMinIters = createIterationCountCheck(VF, UF);
- // Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
- assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
- TCCheckBlock &&
- "Plan's entry must be TCCCheckBlock");
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
return IRVPBB;
}
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
- LoopVectorPreHeader = OrigLoop->getLoopPreheader();
- assert(LoopVectorPreHeader && "Invalid loop structure");
+BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+ assert(VectorPH && "Invalid loop structure");
assert((OrigLoop->getUniqueLatchExitBlock() ||
Cost->requiresScalarEpilogue(VF.isVector())) &&
"loops not exiting via the latch without required epilogue?");
- LoopScalarPreHeader =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, Twine(Prefix) + "scalar.ph");
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
- // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
- // preheader may be unreachable at this point. Instead it is replaced in
- // createVectorizedLoopSkeleton.
+ // wrapping the newly created scalar preheader here at the moment, because the
+ // Plan's scalar preheader may be unreachable at this point. Instead it is
+ // replaced in executePlan.
+ return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
+ Twine(Prefix) + "scalar.ph");
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore(
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
- / | preheader are expanded here. Eventually all required SCEV
- / | expansion should happen here.
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop (created during VPlan execution).
- | |
- | v
- \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
- | | successors created during VPlan execution)
- \/ |
- /\ v
- | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
- | |
- (opt) v <-- edge from middle to exit iff epilogue is not required.
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
- | | wrapped in VPIRBasicBlock).
- \ |
- \ v
- >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
- ...
- */
-
- // Create an empty vector loop, and prepare basic blocks for the runtime
- // checks.
- createVectorLoopSkeleton("");
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitIterationCountCheck(LoopScalarPreHeader);
-
- replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
- return LoopVectorPreHeader;
+ // Create a new IR basic block for the scalar preheader.
+ BasicBlock *ScalarPH = createScalarPreheader("");
+ return ScalarPH->getSinglePredecessor();
}
namespace {
@@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Remove redundant induction instructions.
cse(HeaderBB);
-
- // Set/update profile weights for the vector and remainder loops as original
- // loop iterations are now distributed among them. Note that original loop
- // becomes the scalar remainder loop after vectorization.
- //
- // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
- // end up getting slightly roughened result but that should be OK since
- // profile is not inherently precise anyway. Note also possible bypass of
- // vector code caused by legality checks is ignored, assigning all the weight
- // to the vector loop, optimistically.
- //
- // For scalable vectorization we can't know at compile time how many
- // iterations of the loop are handled in one vector iteration, so instead
- // use the value of vscale used for tuning.
- Loop *VectorLoop = LI->getLoopFor(HeaderBB);
- unsigned EstimatedVFxUF =
- estimateElementCount(VF * UF, Cost->getVScaleForTuning());
- setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
toVectorTy(Type::getInt1Ty(I->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- // Certain instructions can be cheaper to vectorize if they have a constant
- // second vector operand. One example of this are shifts on x86.
- Value *Op2 = I->getOperand(1);
- auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
- Legal->isInvariant(Op2))
- Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-
SmallVector<const Value *, 4> Operands(I->operand_values());
SafeDivisorCost += TTI.getArithmeticInstrCost(
- I->getOpcode(), VecTy, CostKind,
- {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- Op2Info, Operands, I);
+ I->getOpcode(), VecTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Operands, I);
return {ScalarizationCost, SafeDivisorCost};
}
@@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
-bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
+bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
@@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
const unsigned MaxTripCount,
- bool HasTail) const {
+ bool HasTail,
+ bool IsEpilogue) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
@@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+ bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
A.Width.isScalable() && !B.Width.isScalable();
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
@@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- bool HasTail) const {
+ bool HasTail,
+ bool IsEpilogue) const {
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
- return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
- HasTail);
+ return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
+ IsEpilogue);
}
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
@@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
// If no def nor is a store, e.g., branches, continue - no value to check.
if (R.getNumDefinedValues() == 0 &&
- !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
- &R))
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
continue;
// For multi-def recipes, currently only interleaved loads, suffice to
// check first def only.
@@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
P->vectorFactors().end());
SmallVector<VPRegisterUsage, 8> RUs;
- if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
- CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+ if (any_of(VFs, [this](ElementCount VF) {
+ return CM.shouldConsiderRegPressureForVF(VF);
+ }))
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
for (unsigned I = 0; I < VFs.size(); I++) {
@@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
/// If the register pressure needs to be considered for VF,
/// don't consider the VF as valid if it exceeds the number
/// of registers for the target.
- if (CM.shouldCalculateRegPressureForVF(VF) &&
+ if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
continue;
@@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (!VPI)
continue;
switch (VPI->getOpcode()) {
- case VPInstruction::ActiveLaneMask:
+ // Selects are only modelled in the legacy cost model for safe
+ // divisors.
+ case Instruction::Select: {
+ VPValue *VPV = VPI->getVPSingleValue();
+ if (VPV->getNumUsers() == 1) {
+ if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
+ switch (WR->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ continue;
+ default:
+ break;
+ }
+ }
+ }
+ C += VPI->cost(VF, CostCtx);
+ break;
+ }
+ case VPInstruction::ActiveLaneMask: {
+ unsigned Multiplier =
+ cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+ ->getZExtValue();
+ C += VPI->cost(VF * Multiplier, CostCtx);
+ break;
+ }
case VPInstruction::ExplicitVectorLength:
C += VPI->cost(VF, CostCtx);
break;
@@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
}
if (Result.Width.isScalar() ||
- isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
+ isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
+ /*IsEpilogue*/ true))
Result = NextVF;
}
@@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = toVectorTy(Ptr->getType(), VF);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = Ptr->getType();
+
+ if (!Legal->isUniform(Ptr, VF))
+ PtrTy = toVectorTy(PtrTy, VF);
return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI::CastContextHint::None, CostKind, RedOp);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+ IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
if (RedCost.isValid() &&
RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+ IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
InstructionCost ExtraExtCost = 0;
if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+ true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
+ CostKind);
if (RedCost.isValid() && RedCost < MulCost + BaseCost)
return I == RetI ? RedCost : 0;
@@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
assert(Op0->getType()->getScalarSizeInBits() == 1 &&
Op1->getType()->getScalarSizeInBits() == 1);
- SmallVector<const Value *, 2> Operands{Op0, Op1};
return TTI.getArithmeticInstrCost(
- match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
- CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
+ match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
+ VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
}
Type *CondTy = SI->getCondition()->getType();
@@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}))
continue;
VecValuesToIgnore.insert(Op);
- DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
+ append_range(DeadInterleavePointerOps, Op->operands());
}
for (const auto &[_, Ops] : DeadInvariantStoreOps)
@@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
ValuesToIgnore.insert(Op);
VecValuesToIgnore.insert(Op);
- DeadOps.append(Op->op_begin(), Op->op_end());
+ append_range(DeadOps, Op->operands());
}
// Ignore type-promoting instructions we identified during reduction
@@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
ElementCount VF) const {
- if (ForceTargetInstructionCost.getNumOccurrences())
- return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
- return CM.getInstructionCost(UI, VF);
+ InstructionCost Cost = CM.getInstructionCost(UI, VF);
+ if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
+ return InstructionCost(ForceTargetInstructionCost);
+ return Cost;
}
bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
@@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
P->vectorFactors().end());
SmallVector<VPRegisterUsage, 8> RUs;
- if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
- CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+ if (any_of(VFs, [this](ElementCount VF) {
+ return CM.shouldConsiderRegPressureForVF(VF);
+ }))
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
for (unsigned I = 0; I < VFs.size(); I++) {
@@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (CM.shouldCalculateRegPressureForVF(VF) &&
+ if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
@@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
return BestFactor;
}
-static void addRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().starts_with("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(I));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
using namespace VPlanPatternMatch;
assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
@@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
// from the main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
+ VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
// Get the VPInstruction computing the reduction result in the middle block.
// The first operand may not be from the middle block if it is not connected
// to the scalar preheader. In that case, there's nothing to fix.
@@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
// When fixing reductions in the epilogue loop we should already have
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
- auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
- EpiResumePhi->setIncomingValueForBlock(
+ EpiResumePhi.setIncomingValueForBlock(
BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
@@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan, BestVF, VScale);
}
- if (!VectorizingEpilogue) {
- // Checks are the same for all VPlans, added to BestVPlan only for
- // compactness.
- attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
- }
+ // Checks are the same for all VPlans, added to BestVPlan only for
+ // compactness.
+ attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+ VPlanTransforms::cse(BestVPlan);
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- BasicBlock *EntryBB =
- cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
State.CFG.PrevBB->getSingleSuccessor());
@@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// looked through single-entry phis.
ScalarEvolution &SE = *PSE.getSE();
for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
- if (Exit->getNumPredecessors() == 0)
+ if (!Exit->hasPredecessors())
continue;
for (VPRecipeBase &PhiR : Exit->phis())
SE.forgetLcssaPhiWithNewPredecessor(
@@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//
//===------------------------------------------------===//
- // Move check blocks to their final position.
- // TODO: Move as part of VPIRBB execute and update impacted tests.
- if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
- MemCheckBlock->moveAfter(EntryBB);
- if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
- SCEVCheckBlock->moveAfter(EntryBB);
-
BestVPlan.execute(&State);
- // 2.5 When vectorizing the epilogue, fix reduction resume values from the
- // additional bypass block.
- if (VectorizingEpilogue) {
- assert(!BestVPlan.hasEarlyExit() &&
- "Epilogue vectorisation not yet supported with early exits");
- BasicBlock *PH = OrigLoop->getLoopPreheader();
- BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
- for (auto *Pred : predecessors(PH)) {
- for (PHINode &Phi : PH->phis()) {
- if (Phi.getBasicBlockIndex(Pred) != -1)
- continue;
- Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
- }
- }
- VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
- if (ScalarPH->getNumPredecessors() > 0) {
- // If ScalarPH has predecessors, we may need to update its reduction
- // resume values.
- for (VPRecipeBase &R : ScalarPH->phis()) {
- fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
- BypassBlock);
- }
- }
- }
-
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
- if (HeaderVPBB) {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- std::optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
-
- Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
- if (VectorizedLoopID) {
- L->setLoopID(*VectorizedLoopID);
- } else {
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
-
- // Check if it's EVL-vectorized and mark the corresponding metadata.
- bool IsEVLVectorized =
- llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
- // Looking for the ExplictVectorLength VPInstruction.
- if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
- return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
- return false;
- });
- if (IsEVLVectorized) {
- LLVMContext &Context = L->getHeader()->getContext();
- MDNode *LoopID = L->getLoopID();
- auto *IsEVLVectorizedMD = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
- MDString::get(Context, "evl")});
- MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
- {IsEVLVectorizedMD});
- L->setLoopID(NewLoopID);
- }
- }
- TargetTransformInfo::UnrollingPreferences UP;
- TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
- addRuntimeUnrollDisableMetaData(L);
- }
+ // Add metadata to disable runtime unrolling a scalar loop when there
+ // are no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
+ updateLoopMetadataAndProfileInfo(
+ HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
+ : nullptr,
+ HeaderVPBB, VectorizingEpilogue,
+ estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
+ DisableRuntimeUnroll);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("");
+BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
+ BasicBlock *ScalarPH = createScalarPreheader("");
+ BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
EPI.EpilogueIterationCountCheck =
- emitIterationCountCheck(LoopScalarPreHeader, true);
+ emitIterationCountCheck(VectorPH, ScalarPH, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
+ VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
+ ->getSuccessor(1);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
// that goes directly through the vector epilogue. The longer-path length for
@@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
// trip count. Note: the branch will get updated later on when we vectorize
// the epilogue.
EPI.MainLoopIterationCountCheck =
- emitIterationCountCheck(LoopScalarPreHeader, false);
+ emitIterationCountCheck(VectorPH, ScalarPH, false);
- return LoopVectorPreHeader;
+ return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
+ ->getSuccessor(1);
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
});
}
-BasicBlock *
-EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
- bool ForEpilogue) {
+BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
+ BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
assert(Bypass && "Expected valid bypass basic block.");
Value *Count = getTripCount();
MinProfitableTripCount = ElementCount::getFixed(0);
- Value *CheckMinIters =
- createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
- ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
+ Value *CheckMinIters = createIterationCountCheck(
+ VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
+ ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ BasicBlock *const TCCheckBlock = VectorPH;
if (!ForEpilogue)
TCCheckBlock->setName("vector.main.loop.iter.check");
// Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
+ VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+ static_cast<DominatorTree *>(nullptr), LI, nullptr,
+ "vector.ph");
if (ForEpilogue) {
// Save the trip count so we don't have to regenerate it in the
// vec.epilog.iter.check. This is safe to do because the trip count
// generated here dominates the vector epilog iter check.
EPI.TripCount = Count;
} else {
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
}
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
@@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("vec.epilog.");
-
+BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
+ BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
+ BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
- LoopVectorPreHeader->setName("vec.epilog.ph");
+ VectorPH->setName("vec.epilog.ph");
BasicBlock *VecEpilogueIterationCountCheck =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
- nullptr, "vec.epilog.iter.check", true);
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr,
+ "vec.epilog.iter.check", true);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
- emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
+ emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH,
VecEpilogueIterationCountCheck);
AdditionalBypassBlock = VecEpilogueIterationCountCheck;
@@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
"expected this to be saved from the previous pass.");
EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+ VecEpilogueIterationCountCheck, VectorPH);
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
// Adjust the terminators of runtime check blocks and phis using them.
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
if (SCEVCheckBlock)
SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
if (MemCheckBlock)
MemCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
- DT->changeImmediateDominator(LoopScalarPreHeader,
- EPI.EpilogueIterationCountCheck);
+ DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck);
// The vec.epilog.iter.check block may contain Phi nodes from inductions or
// reductions which merge control-flow from the latch block and the middle
@@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
for (PHINode *Phi : PhisInBlock) {
- Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
+ Phi->moveBefore(VectorPH->getFirstNonPHIIt());
Phi->replaceIncomingBlockWith(
VecEpilogueIterationCountCheck->getSinglePredecessor(),
VecEpilogueIterationCountCheck);
@@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
Phi->removeIncomingValue(MemCheckBlock);
}
- return LoopVectorPreHeader;
+ return VectorPH;
}
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
- BasicBlock *Bypass, BasicBlock *Insert) {
+ BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) {
assert(EPI.TripCount &&
"Expected trip count to have been saved in the first pass.");
@@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
EPI.EpilogueVF, EPI.EpilogueUF),
"min.epilog.iters.check");
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- auto VScale = Cost->getVScaleForTuning();
- unsigned MainLoopStep =
- estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
- unsigned EpilogueLoopStep =
- estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
- // We assume the remaining `Count` is equally distributed in
- // [0, MainLoopStep)
- // So the probability for `Count < EpilogueLoopStep` should be
- // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
- unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
- const uint32_t Weights[] = {EstimatedSkipCount,
- MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights, /*IsExpected=*/false);
- }
+ BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+ auto VScale = Cost->getVScaleForTuning();
+ unsigned MainLoopStep =
+ estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
+ unsigned EpilogueLoopStep =
+ estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
+ // We assume the remaining `Count` is equally distributed in
+ // [0, MainLoopStep)
+ // So the probability for `Count < EpilogueLoopStep` should be
+ // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
+ // TODO: Improve the estimate by taking the estimated trip count into
+ // consideration.
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
ReplaceInstWithInst(Insert->getTerminator(), &BI);
// A new entry block has been created for the epilogue VPlan. Hook it in, as
@@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
- Range);
+ VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
CM.foldTailByMasking());
@@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
- if (!Recipe) {
- SmallVector<VPValue *, 4> Operands(R.operands());
- Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
- }
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
@@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// to remove the need to keep a map of masks beyond the predication
// transform.
RecipeBuilder.updateBlockMaskCache(Old2New);
- for (const auto &[Old, _] : Old2New)
+ for (VPValue *Old : Old2New.keys())
Old->getDefiningRecipe()->eraseFromParent();
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
@@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
InterleaveGroups, RecipeBuilder,
CM.isScalarEpilogueAllowed());
- // Replace VPValues for known constant strides guaranteed by predicate scalar
- // evolution.
- auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
- auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
- R->getParent() ==
- Plan->getVectorLoopRegion()->getSinglePredecessor();
- };
- for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
- auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
- auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
- // Only handle constant strides for now.
- if (!ScevStride)
- continue;
-
- auto *CI = Plan->getOrAddLiveIn(
- ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
- if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
- StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
-
- // The versioned value may not be used in the loop directly but through a
- // sext/zext. Add new live-ins in those cases.
- for (Value *U : StrideV->users()) {
- if (!isa<SExtInst, ZExtInst>(U))
- continue;
- VPValue *StrideVPV = Plan->getLiveIn(U);
- if (!StrideVPV)
- continue;
- unsigned BW = U->getType()->getScalarSizeInBits();
- APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
- : ScevStride->getAPInt().zext(BW);
- VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
- StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
- }
- }
+ // Replace VPValues for known constant strides.
+ VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
+ Legal->getLAI()->getSymbolicStrides());
auto BlockNeedsPredication = [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
@@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
OrigLoop, *LI, Legal->getWidestInductionType(),
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
VPlanTransforms::handleEarlyExits(*Plan,
- /*HasUncountableExit*/ false, Range);
+ /*HasUncountableExit*/ false);
VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
/*TailFolded*/ false);
@@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
void LoopVectorizationPlanner::attachRuntimeChecks(
VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
- if (SCEVCheckBlock) {
+ if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
assert((!CM.OptForSize ||
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
@@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
HasBranchWeights);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
- if (MemCheckBlock) {
+ if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
// VPlan-native path does not do any analysis for runtime checks
// currently.
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
@@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+ bool IsIndvarOverflowCheckNeededForVF =
+ VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+ !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
+ CM.getTailFoldingStyle() !=
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ const uint32_t *BranchWeigths =
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+ ? &MinItersBypassWeights[0]
+ : nullptr;
+ VPlanTransforms::addMinimumIterationCheck(
+ Plan, VF, UF, MinProfitableTripCount,
+ CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+ IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths,
+ OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+ *PSE.getSE());
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
+ VF.MinProfitableTripCount);
+
+ LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
}
reportVectorization(ORE, L, VF, 1);
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues(
return EndValueFromAdditionalBypass;
}
+static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+ VPlan &BestEpiPlan,
+ LoopVectorizationLegality &LVL,
+ const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount) {
+ // Fix reduction resume values from the additional bypass block.
+ BasicBlock *PH = L->getLoopPreheader();
+ for (auto *Pred : predecessors(PH)) {
+ for (PHINode &Phi : PH->phis()) {
+ if (Phi.getBasicBlockIndex(Pred) != -1)
+ continue;
+ Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
+ }
+ }
+ auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
+ if (ScalarPH->hasPredecessors()) {
+ // If ScalarPH has predecessors, we may need to update its reduction
+ // resume values.
+ for (const auto &[R, IRPhi] :
+ zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
+ fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi,
+ BypassBlock);
+ }
+ }
+
+ // Fix induction resume values from the additional bypass block.
+ IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
+ for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ Value *V = createInductionAdditionalBypassValues(
+ IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
+ LVL.getPrimaryInduction());
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ Inc->setIncomingValueForBlock(BypassBlock, V);
+ }
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
- &Requirements, &Hints, DB, AC, BFI, PSI);
+ &Requirements, &Hints, DB, AC, BFI, PSI, AA);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
@@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
+ if (!LVL.getPotentiallyFaultingLoads().empty()) {
+ reportVectorizationFailure("Auto-vectorization of loops with potentially "
+ "faulting load is not supported",
+ "PotentiallyFaultingLoadsNotSupported", ORE, L);
+ return false;
+ }
+
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
- bool DisableRuntimeUnroll = false;
- MDNode *OrigLoopID = L->getLoopID();
- {
+ // Report the vectorization decision.
+ if (VF.Width.isScalar()) {
using namespace ore;
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop, then
- // interleave it.
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
-
- // TODO: Move to general VPlan pipeline once epilogue loops are also
- // supported.
- VPlanTransforms::runPass(
- VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
- VF.Width, IC, PSE);
-
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+ assert(IC > 1);
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+ } else {
+ // Report the vectorization decision.
+ reportVectorization(ORE, L, VF, IC);
+ }
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ checkMixedPrecision(L, ORE);
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
- } else {
- // If we decided that it is *legal* to vectorize the loop, then do it.
-
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- // Consider vectorizing the epilogue too if it's profitable.
- VectorizationFactor EpilogueVF =
- LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
- if (EpilogueVF.Width.isVector()) {
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
-
- // The first pass vectorizes the main loop and creates a scalar epilogue
- // to be vectorized by executing the plan (potentially with a different
- // factor) again shortly afterwards.
- VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
- BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
- preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
- BestEpiPlan);
- EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
- BFI, PSI, Checks, *BestMainPlan);
- auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
- *BestMainPlan, MainILV, DT, false);
- ++LoopsVectorized;
-
- // Second pass vectorizes the epilogue and adjusts the control flow
- // edges from the first pass.
- EpilogueVectorizerEpilogueLoop EpilogILV(
- L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan);
- EpilogILV.setTripCount(MainILV.getTripCount());
- preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
-
- LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT, true);
-
- // Fix induction resume values from the additional bypass block.
- BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
- IRBuilder<> BypassBuilder(BypassBlock,
- BypassBlock->getFirstInsertionPt());
- BasicBlock *PH = L->getLoopPreheader();
- for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
- auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
- Value *V = createInductionAdditionalBypassValues(
- IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
- LVL.getPrimaryInduction());
- // TODO: Directly add as extra operand to the VPResumePHI recipe.
- Inc->setIncomingValueForBlock(BypassBlock, V);
- }
- ++LoopsEpilogueVectorized;
+ // If we decided that it is *legal* to interleave or vectorize the loop, then
+ // do it.
- if (!Checks.hasChecks())
- DisableRuntimeUnroll = true;
- } else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
- VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
- Checks, BestPlan);
- // TODO: Move to general VPlan pipeline once epilogue loops are also
- // supported.
- VPlanTransforms::runPass(
- VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
- VF.Width, IC, PSE);
-
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
- ++LoopsVectorized;
-
- // Add metadata to disable runtime unrolling a scalar loop when there
- // are no runtime checks about strides and memory. A scalar loop that is
- // rarely used is not worth unrolling.
- if (!Checks.hasChecks())
- DisableRuntimeUnroll = true;
- }
- // Report the vectorization decision.
- reportVectorization(ORE, L, VF, IC);
- }
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
+ // Consider vectorizing the epilogue too if it's profitable.
+ VectorizationFactor EpilogueVF =
+ LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+ if (EpilogueVF.Width.isVector()) {
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
+
+ // The first pass vectorizes the main loop and creates a scalar epilogue
+ // to be vectorized by executing the plan (potentially with a different
+ // factor) again shortly afterwards.
+ VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
+ BestEpiPlan);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
+ PSI, Checks, *BestMainPlan);
+ auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+ *BestMainPlan, MainILV, DT, false);
+ ++LoopsVectorized;
+
+ // Second pass vectorizes the epilogue and adjusts the control flow
+ // edges from the first pass.
+ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
+ BFI, PSI, Checks, BestEpiPlan);
+ EpilogILV.setTripCount(MainILV.getTripCount());
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
+
+ LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
+ true);
+
+ fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L,
+ BestEpiPlan, LVL, ExpandedSCEVs,
+ EPI.VectorTripCount);
+ ++LoopsEpilogueVectorized;
+ } else {
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
+ Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(
+ VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width,
+ IC, PSE);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
- if (ORE->allowExtraAnalysis(LV_NAME))
- checkMixedPrecision(L, ORE);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+ ++LoopsVectorized;
}
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
+ assert(!verifyFunction(*F, &dbgs()));
- std::optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID) {
- L->setLoopID(*RemainderLoopID);
- } else {
- if (DisableRuntimeUnroll)
- addRuntimeUnrollDisableMetaData(L);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
- }
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
DB = &AM.getResult<DemandedBitsAnalysis>(F);
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
LAIs = &AM.getResult<LoopAccessAnalysis>(F);
+ AA = &AM.getResult<AAManager>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 37dc41413966..6a56dbfaa015 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper {
return false;
}
bool equal(unsigned Opcode) {
- if (Opcode == I->getOpcode())
- return trySet(MainOpBIT, MainOpBIT);
- return false;
+ return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
}
unsigned getOpcode() const {
MaskType Candidate = Mask & SeenBefore;
@@ -5576,7 +5574,23 @@ private:
if (auto *SD = dyn_cast<ScheduleData>(Data)) {
SD->setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ProcessBundleMember(SD, {});
+ SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
+ SmallVector<ScheduleBundle *> Bundles;
+ Instruction *In = SD->getInst();
+ if (R.isVectorized(In)) {
+ ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
+ for (TreeEntry *TE : Entries) {
+ if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) &&
+ In->getNumOperands() != TE->getNumOperands())
+ continue;
+ auto &BundlePtr =
+ PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
+ BundlePtr->setTreeEntry(TE);
+ BundlePtr->add(SD);
+ Bundles.push_back(BundlePtr.get());
+ }
+ }
+ ProcessBundleMember(SD, Bundles);
} else {
ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
Bundle.setScheduled(/*Scheduled=*/true);
@@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
}
/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices,
- Instruction *Inst = nullptr) {
+/// pointers for type ElemTy. If they are not, nullptr is returned.
+/// Otherwise, SCEV* of the stride value is returned.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
SmallVector<const SCEV *> SCEVs;
const SCEV *PtrSCEVLowest = nullptr;
const SCEV *PtrSCEVHighest = nullptr;
@@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
for (Value *Ptr : PointerOps) {
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (!PtrSCEV)
- return std::nullopt;
+ return nullptr;
SCEVs.push_back(PtrSCEV);
if (!PtrSCEVLowest && !PtrSCEVHighest) {
PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
}
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Diff))
- return std::nullopt;
+ return nullptr;
if (Diff->isNonConstantNegative()) {
PtrSCEVLowest = PtrSCEV;
continue;
}
const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
if (isa<SCEVCouldNotCompute>(Diff1))
- return std::nullopt;
+ return nullptr;
if (Diff1->isNonConstantNegative()) {
PtrSCEVHighest = PtrSCEV;
continue;
@@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
// Dist = PtrSCEVHighest - PtrSCEVLowest;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Dist))
- return std::nullopt;
+ return nullptr;
int Size = DL.getTypeStoreSize(ElemTy);
auto TryGetStride = [&](const SCEV *Dist,
const SCEV *Multiplier) -> const SCEV * {
@@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
Stride = TryGetStride(Dist, Sz);
if (!Stride)
- return std::nullopt;
+ return nullptr;
}
if (!Stride || isa<SCEVConstant>(Stride))
- return std::nullopt;
+ return nullptr;
// Iterate through all pointers and check if all distances are
// unique multiple of Stride.
using DistOrdPair = std::pair<int64_t, int>;
@@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
const SCEV *Coeff = TryGetStride(Diff, Stride);
if (!Coeff)
- return std::nullopt;
+ return nullptr;
const auto *SC = dyn_cast<SCEVConstant>(Coeff);
if (!SC || isa<SCEVCouldNotCompute>(SC))
- return std::nullopt;
+ return nullptr;
if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
SE.getMulExpr(Stride, SC)))
->isZero())
- return std::nullopt;
+ return nullptr;
Dist = SC->getAPInt().getZExtValue();
}
// If the strides are not the same or repeated, we can't vectorize.
if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
- return std::nullopt;
+ return nullptr;
auto Res = Offsets.emplace(Dist, Cnt);
if (!Res.second)
- return std::nullopt;
+ return nullptr;
// Consecutive order if the inserted element is the last one.
IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
++Cnt;
}
if (Offsets.size() != SCEVs.size())
- return std::nullopt;
+ return nullptr;
SortedIndices.clear();
if (!IsConsecutive) {
// Fill SortedIndices array only if it is non-consecutive.
@@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
++Cnt;
}
}
- if (!Inst)
- return nullptr;
- SCEVExpander Expander(SE, DL, "strided-load-vec");
- return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+ return Stride;
}
static std::pair<InstructionCost, InstructionCost>
@@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() {
// it is an attempt to reorder node with reused scalars but with
// external uses.
if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
- OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ OrdersUses.try_emplace(OrdersType(), 0).first->second +=
ExternalUserReorderIndices.size();
} else {
for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
- ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+ ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
}
// No other useful reorder data in this entry.
if (Order.empty())
@@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() {
return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
} else {
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ ++OrdersUses.try_emplace(Order, 0).first->second;
}
}
if (OrdersUses.empty())
@@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
- NumOps;
+ OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
} else {
- OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+ OrdersUses.try_emplace(Order, 0).first->second += NumOps;
}
- auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+ auto Res = OrdersUses.try_emplace(OrdersType(), 0);
const auto AllowsReordering = [&](const TreeEntry *TE) {
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis {
}
}
}
- if (MainOp)
+ if (MainOp) {
+ // Do not match, if any copyable is a terminator from the same block as
+ // the main operation.
+ if (any_of(VL, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && I->getParent() == MainOp->getParent() &&
+ I->isTerminator();
+ })) {
+ MainOp = nullptr;
+ return;
+ }
MainOpcode = MainOp->getOpcode();
+ }
}
/// Returns the idempotent value for the \p MainOp with the detected \p
@@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
}
SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
if (all_of(VL, [&](Value *V) {
- return isa<PoisonValue>(V) || Values.contains(V);
+ return isa<PoisonValue>(V) || Values.contains(V) ||
+ (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
+ LI->getLoopFor(S.getMainOp()->getParent()) &&
+ isVectorized(V));
})) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
@@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
}
+ Value *getVectorizedValue(const TreeEntry &E) {
+ Value *Vec = E.VectorizedValue;
+ if (!Vec->getType()->isIntOrIntVectorTy())
+ return Vec;
+ return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
+ return !isa<PoisonValue>(V) &&
+ !isKnownNonNegative(
+ V, SimplifyQuery(*R.DL));
+ }));
+ }
+
public:
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
: BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
@@ -18001,35 +18030,14 @@ public:
/// Adds 2 input vectors (in form of tree entries) and the mask for their
/// shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
- Value *V1 = E1.VectorizedValue;
- if (V1->getType()->isIntOrIntVectorTy())
- V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
- Value *V2 = E2.VectorizedValue;
- if (V2->getType()->isIntOrIntVectorTy())
- V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V1 = getVectorizedValue(E1);
+ Value *V2 = getVectorizedValue(E2);
add(V1, V2, Mask);
}
/// Adds single input vector (in form of tree entry) and the mask for its
/// shuffling.
void add(const TreeEntry &E1, ArrayRef<int> Mask) {
- Value *V1 = E1.VectorizedValue;
- if (V1->getType()->isIntOrIntVectorTy())
- V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V1 = getVectorizedValue(E1);
add(V1, Mask);
}
/// Adds 2 input vectors and the mask for their shuffling.
@@ -18178,14 +18186,7 @@ public:
auto CreateSubVectors = [&](Value *Vec,
SmallVectorImpl<int> &CommonMask) {
for (auto [E, Idx] : SubVectors) {
- Value *V = E->VectorizedValue;
- if (V->getType()->isIntOrIntVectorTy())
- V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V = getVectorizedValue(*E);
unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
// Use scalar version of the SCalarType to correctly handle shuffles
// for revectorization. The revectorization mode operates by the
@@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return cast<LoadInst>(V)->getPointerOperand();
});
OrdersType Order;
- std::optional<Value *> Stride =
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
- &*Builder.GetInsertPoint());
+ const SCEV *StrideSCEV =
+ calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
+ assert(StrideSCEV && "At this point stride should be known");
+ SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+ Value *Stride = Expander.expandCodeFor(
+ StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
Value *NewStride =
- Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
+ Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
StrideVal = Builder.CreateMul(
NewStride,
ConstantInt::get(
@@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree(
!(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst &&
VectorizableTree.front()->isGather() &&
- is_contained(VectorizableTree.front()->Scalars, I)))
+ is_contained(VectorizableTree.front()->Scalars, I)) &&
+ !(!VectorizableTree.front()->isGather() &&
+ VectorizableTree.front()->isCopyableElement(I)))
continue;
SmallVector<SelectInst *> LogicalOpSelects;
I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
@@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
continue;
}
auto *SD = cast<ScheduleData>(SE);
+ if (SD->hasValidDependencies() &&
+ (!S.areInstructionsWithCopyableElements() ||
+ !S.isCopyableElement(SD->getInst())) &&
+ !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
+ EI.UserTE->hasState() &&
+ (!EI.UserTE->hasCopyableElements() ||
+ !EI.UserTE->isCopyableElement(SD->getInst())))
+ SD->clearDirectDependencies();
for (const Use &U : SD->getInst()->operands()) {
unsigned &NumOps =
UserOpToNumOps
@@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
if (auto *Op = dyn_cast<Instruction>(U.get());
Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
*SLP, NumOps)) {
- if (ScheduleData *OpSD = getScheduleData(Op)) {
+ if (ScheduleData *OpSD = getScheduleData(Op);
+ OpSD && OpSD->hasValidDependencies()) {
OpSD->clearDirectDependencies();
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ScheduleCopyableDataMapByUsers.erase(I);
ScheduleCopyableDataMap.erase(KV);
// Need to recalculate dependencies for the actual schedule data.
- if (ScheduleData *OpSD = getScheduleData(I)) {
+ if (ScheduleData *OpSD = getScheduleData(I);
+ OpSD && OpSD->hasValidDependencies()) {
OpSD->clearDirectDependencies();
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote(
return TryProcessInstruction(BitWidth);
case Instruction::ZExt:
case Instruction::SExt:
+ if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
+ E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
+ E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
+ return false;
IsProfitableToDemote = true;
return TryProcessInstruction(BitWidth);
@@ -23797,9 +23817,7 @@ public:
size_t Key, Idx;
std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
/*AllowAlternate=*/false);
- ++PossibleReducedVals[Key][Idx]
- .insert(std::make_pair(V, 0))
- .first->second;
+ ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
}
for (Instruction *I : reverse(PossibleReductionOps))
Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
@@ -23820,21 +23838,20 @@ public:
stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
return P1.size() > P2.size();
});
- int NewIdx = -1;
+ bool First = true;
for (ArrayRef<Value *> Data : PossibleRedValsVect) {
- if (NewIdx < 0 ||
- (!isGoodForReduction(Data) &&
- (!isa<LoadInst>(Data.front()) ||
- !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
- getUnderlyingObject(
- cast<LoadInst>(Data.front())->getPointerOperand()) !=
- getUnderlyingObject(
- cast<LoadInst>(ReducedVals[NewIdx].front())
- ->getPointerOperand())))) {
- NewIdx = ReducedVals.size();
+ if (First) {
+ First = false;
ReducedVals.emplace_back();
+ } else if (!isGoodForReduction(Data)) {
+ auto *LI = dyn_cast<LoadInst>(Data.front());
+ auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
+ if (!LI || !LastLI ||
+ getUnderlyingObject(LI->getPointerOperand()) !=
+ getUnderlyingObject(LastLI->getPointerOperand()))
+ ReducedVals.emplace_back();
}
- ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
+ ReducedVals.back().append(Data.rbegin(), Data.rend());
}
}
// Sort the reduced values by number of same/alternate opcode and/or pointer
@@ -23847,7 +23864,8 @@ public:
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC,
+ DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -24164,9 +24182,7 @@ public:
// previous vectorization attempts.
if (any_of(VL, [&V](Value *RedVal) {
auto *RedValI = dyn_cast<Instruction>(RedVal);
- if (!RedValI)
- return false;
- return V.isDeleted(RedValI);
+ return RedValI && V.isDeleted(RedValI);
}))
break;
V.buildTree(VL, IgnoreList);
@@ -24248,7 +24264,7 @@ public:
// Estimate cost.
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+ getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n");
@@ -24553,7 +24569,9 @@ private:
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
bool IsCmpSelMinMax, FastMathFlags FMF,
- const BoUpSLP &R) {
+ const BoUpSLP &R, DominatorTree &DT,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
@@ -24578,6 +24596,22 @@ private:
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ if (RdxKind == RecurKind::FAdd) {
+ InstructionCost FMACost = canConvertToFMA(
+ RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+ if (FMACost.isValid()) {
+ LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+ if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+ // Also, exclude scalar fmul cost.
+ InstructionCost FMulCost =
+ TTI->getInstructionCost(I, CostKind);
+ LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ ScalarCost += FMACost;
+ continue;
+ }
+ }
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
@@ -24642,8 +24676,45 @@ private:
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
- VectorCost +=
- TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+ InstructionCost FMACost = InstructionCost::getInvalid();
+ if (RdxKind == RecurKind::FAdd) {
+ // Check if the reduction operands can be converted to FMA.
+ SmallVector<Value *> Ops;
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *RdxVal : ReducedVals) {
+ if (!RdxVal->hasOneUse()) {
+ Ops.clear();
+ break;
+ }
+ if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+ FMF &= FPCI->getFastMathFlags();
+ Ops.push_back(RdxVal->user_back());
+ }
+ if (!Ops.empty()) {
+ FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+ *TTI, TLI);
+ if (FMACost.isValid()) {
+ // Calculate actual FMAD cost.
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+ {RVecTy, RVecTy, RVecTy}, FMF);
+ FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+ // Also, exclude vector fmul cost.
+ InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+ Instruction::FMul, RVecTy, CostKind);
+ LLVM_DEBUG(dbgs()
+ << "Minus vector FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ }
+ }
+ if (FMACost.isValid())
+ VectorCost += FMACost;
+ else
+ VectorCost +=
+ TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
if (RType != RedTy) {
unsigned Opcode = Instruction::Trunc;
if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
@@ -25540,7 +25611,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
template <typename T>
static bool tryToVectorizeSequence(
SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
- function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
@@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence(
auto *SameTypeIt = IncIt;
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
- AreCompatible(*SameTypeIt, *IncIt))) {
+ AreCompatible(VL, *SameTypeIt))) {
auto *I = dyn_cast<Instruction>(*SameTypeIt);
++SameTypeIt;
if (I && !R.isDeleted(I))
@@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
return compareCmp<false>(V, V2, *TLI, *DT);
};
- auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
- if (V1 == V2)
+ auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
+ if (VL.empty() || VL.back() == V1)
return true;
- return compareCmp<true>(V1, V2, *TLI, *DT);
+ return compareCmp<true>(V1, VL.back(), *TLI, *DT);
};
SmallVector<Value *> Vals;
@@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return false;
};
- auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
- if (V1 == V2)
+ auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
+ Value *V1) {
+ if (VL.empty() || V1 == VL.back())
return true;
+ Value *V2 = VL.back();
if (V1->getType() != V2->getType())
return false;
ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
@@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
InstSetVector PostProcessInserts;
SmallSetVector<CmpInst *, 8> PostProcessCmps;
- // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
+ // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
// also vectorizes `PostProcessCmps`.
auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
@@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getValueID();
};
- auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
+ bool SameParent = true;
+ auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
+ if (VL.empty()) {
+ SameParent = true;
+ return true;
+ }
+ StoreInst *V2 = VL.back();
if (V1 == V2)
return true;
if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
@@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (isa<UndefValue>(V1->getValueOperand()) ||
isa<UndefValue>(V2->getValueOperand()))
return true;
- if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
- if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
- if (I1->getParent() != I2->getParent())
- return false;
- return getSameOpcode({I1, I2}, *TLI).valid();
- }
if (isa<Constant>(V1->getValueOperand()) &&
isa<Constant>(V2->getValueOperand()))
return true;
+ // Check if the operands of the stores can be vectorized. They can be
+ // vectorized, if they have compatible operands or have operands, which can
+ // be vectorized as copyables.
+ auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
+ auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
+ if (I1 || I2) {
+ // Accept only tail-following non-compatible values for now.
+ // TODO: investigate if it is possible to vectorize incompatible values,
+ // if the copyables are first in the list.
+ if (I1 && !I2)
+ return false;
+ SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
+ SmallVector<Value *> NewVL(VL.size() + 1);
+ for (auto [SI, V] : zip(VL, NewVL))
+ V = SI->getValueOperand();
+ NewVL.back() = V1->getValueOperand();
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
+ /*SkipSameCodeCheck=*/!SameParent);
+ if (S)
+ return true;
+ if (!SameParent)
+ return false;
+ }
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f972efa07eb7..16b1b539345d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -45,6 +45,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <cassert>
#include <string>
@@ -55,6 +56,15 @@ namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
+/// @{
+/// Metadata attribute names
+const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
+const char LLVMLoopVectorizeFollowupVectorized[] =
+ "llvm.loop.vectorize.followup_vectorized";
+const char LLVMLoopVectorizeFollowupEpilogue[] =
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
extern cl::opt<unsigned> ForceTargetInstructionCost;
static cl::opt<bool> PrintVPlansInDotFormat(
@@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) {
for (unsigned i = 0; i < WorkList.size(); i++) {
T *Current = WorkList[i];
- if (Current->getNumPredecessors() == 0)
+ if (!Current->hasPredecessors())
return Current;
auto &Predecessors = Current->getPredecessors();
WorkList.insert_range(Predecessors);
@@ -216,7 +226,7 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
// If VPBB is in a region R, VPBB is a loop header if R is a loop region with
// VPBB as its entry, i.e., free of predecessors.
if (auto *R = VPBB->getParent())
- return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
+ return !R->isReplicator() && !VPBB->hasPredecessors();
// A header dominates its second predecessor (the latch), with the other
// predecessor being the preheader
@@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
void VPIRBasicBlock::execute(VPTransformState *State) {
assert(getHierarchicalSuccessors().size() <= 2 &&
"VPIRBasicBlock can have at most two successors at the moment!");
+ // Move completely disconnected blocks to their final position.
+ if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB))
+ IRBB->moveAfter(State->CFG.PrevBB);
State->Builder.SetInsertPoint(IRBB->getTerminator());
State->CFG.PrevBB = IRBB;
State->CFG.VPBB2IRBB[this] = IRBB;
@@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
const VPBlockBase *Pred = nullptr;
- if (getNumPredecessors() > 0) {
+ if (hasPredecessors()) {
Pred = getPredecessors()[Idx];
} else {
auto *Region = getParent();
@@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() {
BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
VPIRBasicBlock *NewScalarHeader = nullptr;
- if (getScalarHeader()->getNumPredecessors() == 0) {
- NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
- } else {
+ if (getScalarHeader()->hasPredecessors()) {
NewScalarHeader = cast<VPIRBasicBlock>(*find_if(
vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) {
auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB);
return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
}));
+ } else {
+ NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
}
// Create VPlan, clone live-ins and remap operands in the cloned blocks.
auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader);
@@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
// First assign the base name for V.
- const auto &[A, _] = VPValue2Name.insert({V, BaseName});
+ const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName);
// Integer or FP constants with different types will result in he same string
// due to stripping types.
if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV))
@@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
// If it is already used by C > 0 other VPValues, increase the version counter
// C and use it for V.
- const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0});
+ const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0);
if (!UseInserted) {
C->second++;
A->second = (BaseName + Twine(".") + Twine(C->second)).str();
@@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
llvm_unreachable("No plan found!");
}
+static void addRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+ if (S->getString().starts_with("llvm.loop.unroll.runtime.disable"))
+ continue;
+ IsUnrollMetadata =
+ S->getString().starts_with("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(I));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
+void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
+ Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue,
+ unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) {
+ MDNode *LID = OrigLoop->getLoopID();
+ // Update the metadata of the scalar loop. Skip the update when vectorizing
+ // the epilogue loop, to ensure it is only updated once.
+ if (!VectorizingEpilogue) {
+ std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID(
+ LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID) {
+ OrigLoop->setLoopID(*RemainderLoopID);
+ } else {
+ if (DisableRuntimeUnroll)
+ addRuntimeUnrollDisableMetaData(OrigLoop);
+
+ LoopVectorizeHints Hints(OrigLoop, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ }
+
+ if (!VectorLoop)
+ return;
+
+ if (std::optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized})) {
+ VectorLoop->setLoopID(*VectorizedLoopID);
+ } else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (LID)
+ VectorLoop->setLoopID(LID);
+
+ if (!VectorizingEpilogue) {
+ LoopVectorizeHints Hints(VectorLoop, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+
+ // Check if it's EVL-vectorized and mark the corresponding metadata.
+ bool IsEVLVectorized =
+ llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
+ // Looking for the ExplictVectorLength VPInstruction.
+ if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
+ return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
+ return false;
+ });
+ if (IsEVLVectorized) {
+ LLVMContext &Context = VectorLoop->getHeader()->getContext();
+ MDNode *LoopID = VectorLoop->getLoopID();
+ auto *IsEVLVectorizedMD = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
+ MDString::get(Context, "evl")});
+ MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
+ {IsEVLVectorizedMD});
+ VectorLoop->setLoopID(NewLoopID);
+ }
+ }
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
+ addRuntimeUnrollDisableMetaData(VectorLoop);
+
+ // Set/update profile weights for the vector and remainder loops as original
+ // loop iterations are now distributed among them. Note that original loop
+ // becomes the scalar remainder loop after vectorization.
+ //
+ // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
+ // end up getting slightly roughened result but that should be OK since
+ // profile is not inherently precise anyway. Note also possible bypass of
+ // vector code caused by legality checks is ignored, assigning all the weight
+ // to the vector loop, optimistically.
+ //
+ // For scalable vectorization we can't know at compile time how many
+ // iterations of the loop are handled in one vector iteration, so instead
+ // use the value of vscale used for tuning.
+ setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
if (VPlans.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d6bc462a0dfa..53291a931530 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -219,6 +219,9 @@ public:
size_t getNumSuccessors() const { return Successors.size(); }
size_t getNumPredecessors() const { return Predecessors.size(); }
+ /// Returns true if this block has any predecessors.
+ bool hasPredecessors() const { return !Predecessors.empty(); }
+
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
public:
VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPDef(SC), VPUser(Operands), DL(DL) {}
virtual ~VPRecipeBase() = default;
@@ -518,11 +521,11 @@ protected:
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(SC, Operands, DL), VPValue(this) {}
VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
- Value *UV, DebugLoc DL = {})
+ Value *UV, DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}
static inline bool classof(const VPRecipeBase *R) {
@@ -557,6 +560,7 @@ public:
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
+ case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenLoadEVLSC:
@@ -712,12 +716,15 @@ public:
VPIRFlags(GEPNoWrapFlags GEPFlags)
: OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {}
-public:
void transferFlags(VPIRFlags &Other) {
OpType = Other.OpType;
AllFlags = Other.AllFlags;
}
+ /// Only keep flags also present in \p Other. \p Other must have the same
+ /// OpType as the current object.
+ void intersectFlags(const VPIRFlags &Other);
+
/// Drop all poison-generating flags.
void dropPoisonGeneratingFlags() {
// NOTE: This needs to be kept in-sync with
@@ -864,7 +871,7 @@ public:
/// using IR flags.
struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {}
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
@@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
: VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {}
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL = {})
+ const VPIRFlags &Flags,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {}
static inline bool classof(const VPRecipeBase *R) {
@@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
return R && classof(R);
}
+ static inline bool classof(const VPSingleDefRecipe *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
+
void execute(VPTransformState &State) override = 0;
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
@@ -975,6 +988,10 @@ public:
Not,
SLPLoad,
SLPStore,
+ // Creates a mask where each lane is active (true) whilst the current
+ // counter (first operand + index) is less than the second operand. i.e.
+ // mask[i] = icmpt ult (op0 + i), op1
+ // The size of the mask returned is VF * Multiplier (UF, third op).
ActiveLaneMask,
ExplicitVectorLength,
CalculateTripCountMinusVF,
@@ -1014,7 +1031,8 @@ public:
// Returns a scalar boolean value, which is true if any lane of its
// (boolean) vector operands is true. It produces the reduced value across
// all unrolled iterations. Unrolling will add all copies of its original
- // operand as additional operands.
+ // operand as additional operands. AnyOf is poison-safe as all operands
+ // will be frozen.
AnyOf,
// Calculates the first active lane index of the vector predicate operands.
// It produces the lane index across all unrolled iterations. Unrolling will
@@ -1080,13 +1098,13 @@ private:
#endif
public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
- const Twine &Name = "")
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL = {},
+ const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "");
VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
@@ -1479,7 +1497,8 @@ public:
}
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
- const VPIRFlags &Flags = {}, DebugLoc DL = {})
+ const VPIRFlags &Flags = {},
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL),
VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) {
assert(flagsValidForOpcode(Opcode) &&
@@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
public:
VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
MayReadFromMemory(CI.mayReadFromMemory()),
@@ -1546,7 +1565,7 @@ public:
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
@@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
public:
VPWidenCallRecipe(Value *UV, Function *Variant,
- ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
+ ArrayRef<VPValue *> CallArguments,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
*cast<Instruction>(UV)),
VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) {
@@ -1644,10 +1664,8 @@ public:
return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
}
- operand_range args() { return make_range(op_begin(), std::prev(op_end())); }
- const_operand_range args() const {
- return make_range(op_begin(), std::prev(op_end()));
- }
+ operand_range args() { return drop_end(operands()); }
+ const_operand_range args() const { return drop_end(operands()); }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
@@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase {
public:
VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}
~VPHistogramRecipe() override = default;
@@ -1998,6 +2016,9 @@ public:
return getOperand(1);
}
+ /// Update the incoming value from the loop backedge.
+ void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
/// Returns the backedge value as a recipe. The backedge value is guaranteed
/// to be a recipe.
virtual VPRecipeBase &getBackedgeRecipe() {
@@ -2229,8 +2250,8 @@ protected:
public:
/// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and
/// debug location \p DL.
- VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {},
- const Twine &Name = "")
+ VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr,
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
: VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL),
Name(Name.str()) {
if (Start)
@@ -2381,9 +2402,8 @@ public:
}
VPBlendRecipe *clone() override {
- SmallVector<VPValue *> Ops(operands());
- return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops,
- getDebugLoc());
+ return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()),
+ operands(), getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPBlendSC)
@@ -2409,6 +2429,12 @@ public:
return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
}
+ /// Set mask number \p Idx to \p V.
+ void setMask(unsigned Idx, VPValue *V) {
+ assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
+ Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V);
+ }
+
void execute(VPTransformState &State) override {
llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
}
@@ -2434,12 +2460,13 @@ public:
}
};
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
- public VPIRMetadata {
+/// A common base class for interleaved memory operations.
+/// An Interleaved memory operation is a memory access method that combines
+/// multiple strided loads/stores into a single wide load/store with shuffles.
+/// The first operand is the start address. The optional operands are, in order,
+/// the stored values and the mask.
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
+ public VPIRMetadata {
const InterleaveGroup<Instruction> *IG;
/// Indicates if the interleave group is in a conditional block and requires a
@@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
/// unusued gaps can be loaded speculatively.
bool NeedsMaskForGaps = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues, VPValue *Mask,
- bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
- : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD),
- IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+protected:
+ VPInterleaveBase(const unsigned char SC,
+ const InterleaveGroup<Instruction> *IG,
+ ArrayRef<VPValue *> Operands,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG),
+ NeedsMaskForGaps(NeedsMaskForGaps) {
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
@@ -2473,14 +2502,19 @@ public:
addOperand(Mask);
}
}
- ~VPInterleaveRecipe() override = default;
- VPInterleaveRecipe *clone() override {
- return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
- NeedsMaskForGaps, *this, getDebugLoc());
+public:
+ VPInterleaveBase *clone() override = 0;
+
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+ R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
}
- VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
@@ -2490,48 +2524,130 @@ public:
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ // Mask is optional and the last operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
}
+ /// Return true if the access needs a mask because of the gaps.
+ bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; }
+
+ Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPInterleaveBase should not be instantiated.");
+ }
+
+ /// Return the cost of this recipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+
+ /// Returns the number of stored operands of this interleave group. Returns 0
+ /// for load interleave groups.
+ virtual unsigned getNumStoreOperands() const = 0;
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
ArrayRef<VPValue *> getStoredValues() const {
- // The first operand is the address, followed by the stored values, followed
- // by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumStoreOperands());
+ return ArrayRef<VPValue *>(op_end() -
+ (getNumStoreOperands() + (HasMask ? 1 : 0)),
+ getNumStoreOperands());
+ }
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask,
+ NeedsMaskForGaps, MD, DL) {}
+
+ ~VPInterleaveRecipe() override = default;
+
+ VPInterleaveRecipe *clone() override {
+ return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(),
+ getStoredValues(), getMask(),
+ needsMaskForGaps(), *this, getDebugLoc());
}
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
/// Generate the wide load or store, and shuffles.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPInterleaveRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ }
- /// Returns the number of stored operands of this interleave group. Returns 0
- /// for load interleave groups.
- unsigned getNumStoreOperands() const {
- return getNumOperands() - (HasMask ? 2 : 1);
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (getMask() ? 2 : 1);
}
+};
+
+/// A recipe for interleaved memory operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length. Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
+ : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+ ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+ R.getStoredValues(), Mask, R.needsMaskForGaps(), R,
+ R.getDebugLoc()) {
+ assert(!getInterleaveGroup()->isReverse() &&
+ "Reversed interleave-group with tail folding is not supported.");
+ assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
+ "supported for scalable vector.");
+ }
+
+ ~VPInterleaveEVLRecipe() override = default;
+
+ VPInterleaveEVLRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(1); }
- /// The recipe only uses the first lane of the address.
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The recipe only uses the first lane of the address, and EVL operand.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+ Op == getEVL();
}
- Instruction *getInsertPos() const { return IG->getInsertPos(); }
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (getMask() ? 3 : 2);
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2561,14 +2677,14 @@ protected:
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
@@ -2686,7 +2802,7 @@ public:
class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
public:
VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(
VPDef::VPReductionEVLSC, R.getRecurrenceKind(),
R.getFastMathFlags(),
@@ -3537,7 +3653,8 @@ public:
InductionOpcode(Opcode) {}
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
- VPValue *Step, VPValue *VF, DebugLoc DL = {})
+ VPValue *Step, VPValue *VF,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPScalarIVStepsRecipe(
IV, Step, VF, IndDesc.getInductionOpcode(),
dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
@@ -4142,7 +4259,7 @@ public:
/// Returns an iterator range over all VFs of the plan.
iterator_range<SmallSetVector<ElementCount, 2>::iterator>
vectorFactors() const {
- return {VFs.begin(), VFs.end()};
+ return VFs;
}
bool hasScalarVFOnly() const {
@@ -4299,9 +4416,8 @@ public:
/// via the other early exit).
bool hasEarlyExit() const {
return count_if(ExitBlocks,
- [](VPIRBasicBlock *EB) {
- return EB->getNumPredecessors() != 0;
- }) > 1 ||
+ [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) >
+ 1 ||
(ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
}
@@ -4309,7 +4425,7 @@ public:
/// that this relies on unneeded branches to the scalar tail loop being
/// removed.
bool hasScalarTail() const {
- return !(getScalarPreheader()->getNumPredecessors() == 0 ||
+ return !(!getScalarPreheader()->hasPredecessors() ||
getScalarPreheader()->getSinglePredecessor() == getEntry());
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22..d400ceff7797 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
- .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+ .Case<VPInterleaveBase>([V](const auto *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 80b48de57b40..cef91c15dd87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+ // Don't emit recipes for unconditional switch instructions.
+ if (SI->getNumCases() == 0)
+ continue;
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
for (auto Case : SI->cases())
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
@@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
}
void VPlanTransforms::handleEarlyExits(VPlan &Plan,
- bool HasUncountableEarlyExit,
- VFRange &Range) {
+ bool HasUncountableEarlyExit) {
auto *MiddleVPBB = cast<VPBasicBlock>(
Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
@@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
- Range);
+ cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
@@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
+void VPlanTransforms::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ CmpInst::Predicate CmpPred =
+ RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ // If tail is to be folded, vector loop takes care of all iterations.
+ VPValue *TripCountVPV = Plan.getTripCount();
+ const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE);
+ Type *TripCountTy = TripCount->getType();
+ auto GetMinTripCount = [&]() -> const SCEV * {
+ // Compute max(MinProfitableTripCount, UF * VF) and return it.
+ const SCEV *VFxUF =
+ SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW);
+ if (UF * VF.getKnownMinValue() >=
+ MinProfitableTripCount.getKnownMinValue()) {
+ // TODO: SCEV should be able to simplify test.
+ return VFxUF;
+ }
+ const SCEV *MinProfitableTripCountSCEV =
+ SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW);
+ return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF);
+ };
+
+ VPBasicBlock *EntryVPBB = Plan.getEntry();
+ VPBuilder Builder(EntryVPBB);
+ VPValue *TripCountCheck = Plan.getFalse();
+ const SCEV *Step = GetMinTripCount();
+ if (TailFolded) {
+ if (CheckNeededWithTailFolding) {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+
+ // Get the maximum unsigned value for the type.
+ VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
+ TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *DistanceToMax = Builder.createNaryOp(
+ Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
+ DebugLoc::getUnknown());
+
+ // Don't execute the vector loop if (UMax - n) < (VF * UF).
+ // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF,
+ // minProfitableTripCount).
+ TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
+ Builder.createExpandSCEV(Step), DL);
+ } else {
+ // TripCountCheck = false, folding tail implies positive vector trip
+ // count.
+ }
+ } else {
+ // TODO: Emit unconditional branch to vector preheader instead of
+ // conditional branch with known condition.
+ TripCount = SE.applyLoopGuards(TripCount, OrigLoop);
+ // Check if the trip count is < the step.
+ if (SE.isKnownPredicate(CmpPred, TripCount, Step)) {
+ // TODO: Ensure step is at most the trip count when determining max VF and
+ // UF, w/o tail folding.
+ TripCountCheck = Plan.getTrue();
+ } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred),
+ TripCount, Step)) {
+ // Generate the minimum iteration check only if we cannot prove the
+ // check is known to be true, or known to be false.
+ VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step);
+ TripCountCheck = Builder.createICmp(
+ CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check");
+ } // else step known to be < trip count, use TripCountCheck preset to false.
+ }
+ VPInstruction *Term =
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL);
+ if (MinItersBypassWeights) {
+ MDBuilder MDB(Plan.getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
+ Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ }
+}
+
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 1ec6ae677374..109156c1469c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() {
return int_pred_ty<is_all_ones>();
}
+struct is_zero_int {
+ bool isValue(const APInt &C) const { return C.isZero(); }
+};
+
+/// Match an integer 0 or a vector with all elements equal to 0.
+/// For vectors, this includes constants with undefined elements.
+inline int_pred_ty<is_zero_int> m_ZeroInt() {
+ return int_pred_ty<is_zero_int>();
+}
+
/// Matching combinators
template <typename LTy, typename RTy> struct match_combine_or {
LTy L;
@@ -218,9 +228,12 @@ struct Recipe_match {
if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...))
return false;
- assert(R->getNumOperands() == std::tuple_size<Ops_t>::value &&
- "recipe with matched opcode does not have the expected number of "
- "operands");
+ if (R->getNumOperands() != std::tuple_size<Ops_t>::value) {
+ assert(Opcode == Instruction::PHI &&
+ "non-variadic recipe with matched opcode does not have the "
+ "expected number of operands");
+ return false;
+ }
auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>();
if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) {
@@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) {
}
template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t>
+m_EVL(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0);
+}
+
+template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}
-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
- return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
}
template <typename Op0_t, typename Op1_t>
@@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) {
return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
}
+template <typename Op0_t>
+inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t>
+m_ZExtOrSelf(const Op0_t &Op0) {
+ return m_CombineOr(m_ZExt(Op0), Op0);
+}
+
template <unsigned Opcode, typename Op0_t, typename Op1_t>
inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0,
const Op1_t &Op1) {
@@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}
+/// Match a binary AND operation.
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t>
+m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1);
+}
+
/// Match a binary OR operation. Note that while conceptually the operands can
/// be matched commutatively, \p Commutative defaults to false in line with the
/// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index cdadc33e3088..0c27d535b680 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,11 +14,13 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
using namespace llvm;
+using namespace VPlanPatternMatch;
namespace {
class VPPredicator {
@@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
"Distinct incoming values with one having a full mask");
break;
}
+
OperandsWithMask.push_back(EdgeMask);
}
PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c4fdcccc6d62..bf5148954309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
- return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
@@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return false;
}
default:
+ // FIXME: Return false if the recipe represents an interleaved store.
return true;
}
}
@@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
- else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+ else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
@@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
+ assert(OpType == Other.OpType && "OpType must match");
+ switch (OpType) {
+ case OperationType::OverflowingBinOp:
+ WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
+ WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
+ break;
+ case OperationType::Trunc:
+ TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
+ TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
+ break;
+ case OperationType::DisjointOp:
+ DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
+ break;
+ case OperationType::PossiblyExactOp:
+ ExactFlags.IsExact &= Other.ExactFlags.IsExact;
+ break;
+ case OperationType::GEPOp:
+ GEPFlags &= Other.GEPFlags;
+ break;
+ case OperationType::FPMathOp:
+ FMFs.NoNaNs &= Other.FMFs.NoNaNs;
+ FMFs.NoInfs &= Other.FMFs.NoInfs;
+ break;
+ case OperationType::NonNegOp:
+ NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
+ break;
+ case OperationType::Cmp:
+ assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
+ break;
+ case OperationType::Other:
+ assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
+ break;
+ }
+}
+
FastMathFlags VPIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Store:
- case VPInstruction::ActiveLaneMask:
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
+ case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
return 3;
@@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
Name);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
+ auto PredTy = VectorType::get(
+ Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+ ->getZExtValue());
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
@@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::AnyOf: {
- Value *Res = State.get(getOperand(0));
+ Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
for (VPValue *Op : drop_begin(operands()))
- Res = Builder.CreateOr(Res, State.get(Op));
+ Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
case VPInstruction::ExtractLane: {
@@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
unsigned LastOpIdx = getNumOperands() - 1;
Value *Res = nullptr;
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
- Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
- Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+ Value *TrailingZeros =
+ State.VF.isScalar()
+ ? Builder.CreateZExt(
+ Builder.CreateICmpEQ(State.get(getOperand(Idx)),
+ Builder.getFalse()),
+ Builder.getInt64Ty())
+ : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
+ State.get(getOperand(Idx)),
+ true, Name);
Value *Current = Builder.CreateAdd(
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
if (Res) {
@@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case Instruction::Select: {
+ // TODO: It may be possible to improve this by analyzing where the
+ // condition operand comes from.
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+ auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+ if (!vputils::onlyFirstLaneUsed(this)) {
+ CondTy = toVectorTy(CondTy, VF);
+ VecTy = toVectorTy(VecTy, VF);
+ }
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+ Ctx.CostKind);
+ }
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
+ if (VF.isScalar()) {
+ // ExtractLane with VF=1 takes care of handling extracting across multiple
+ // parts.
+ return 0;
+ }
+
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+ if (VF.isScalar())
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy),
+ CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index.
- auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
@@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::ActiveLaneMask: {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
- Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ unsigned Multiplier =
+ cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+ Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
@@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.set(this, V);
}
-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+ ArrayRef<const VPValue *> Operands,
+ const VPRecipeWithIRFlags &R,
+ ElementCount VF,
+ VPCostContext &Ctx) {
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
- for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (const auto &[Idx, Op] : enumerate(Operands)) {
auto *V = Op->getUnderlyingValue();
if (!V) {
- if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+ if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
Arguments.push_back(UI->getArgOperand(Idx));
continue;
}
@@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+ Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
- for (unsigned I = 0; I != getNumOperands(); ++I)
- ParamTys.push_back(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ for (const VPValue *Op : Operands) {
+ ParamTys.push_back(VF.isVector()
+ ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+ : Ctx.Types.inferScalarType(Op));
+ }
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
- FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+ FastMathFlags FMF =
+ R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
- VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ ID, RetTy, Arguments, ParamTys, FMF,
+ dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ SmallVector<const VPValue *> ArgOps(operands());
+ return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
+}
+
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
@@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // More complex computation, let the legacy cost-model handle this for now.
- return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+ // If the div/rem operation isn't safe to speculate and requires
+ // predication, then the only way we can even create a vplan is to insert
+ // a select on the second input operand to ensure we use the value of 1
+ // for the inactive lanes. The select will be costed separately.
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (VF.isScalar())
return TTI::CastContextHint::Normal;
- if (isa<VPInterleaveRecipe>(R))
+ if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
assert(RedTy->isIntegerTy() &&
"VPExpressionRecipe only supports integer types currently.");
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(
- cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
return Ctx.TTI.getExtendedReductionCost(
Opcode,
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case ExpressionTypes::MulAccReduction:
- return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+ return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+ Ctx.CostKind);
case ExpressionTypes::ExtMulAccReduction:
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
- RedTy, SrcVecTy, Ctx.CostKind);
+ Opcode, RedTy, SrcVecTy, Ctx.CostKind);
}
llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
@@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// instruction cost.
return 0;
case Instruction::Call: {
- if (!isSingleScalar()) {
- // TODO: Handle remaining call costs here as well.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
- break;
- }
-
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
- if (CalledFn->isIntrinsic())
- break;
+ SmallVector<const VPValue *> ArgOps(drop_end(operands()));
SmallVector<Type *, 4> Tys;
- for (VPValue *ArgOp : drop_end(operands()))
+ for (const VPValue *ArgOp : ArgOps)
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+ if (CalledFn->isIntrinsic())
+ // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+ // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+ switch (CalledFn->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::sideeffect:
+ case Intrinsic::pseudoprobe:
+ case Intrinsic::experimental_noalias_scope_decl: {
+ assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx) == 0 &&
+ "scalarizing intrinsic should be free");
+ return InstructionCost(0);
+ }
+ default:
+ break;
+ }
+
Type *ResultTy = Ctx.Types.inferScalarType(this);
- return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ InstructionCost ScalarCallCost =
+ Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ if (isSingleScalar()) {
+ if (CalledFn->isIntrinsic())
+ ScalarCallCost = std::min(
+ ScalarCallCost,
+ getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx));
+ return ScalarCallCost;
+ }
+
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ // Compute the cost of scalarizing the result and operands if needed.
+ InstructionCost ScalarizationCost = 0;
+ if (VF.isVector()) {
+ if (!ResultTy->isVoidTy()) {
+ for (Type *VectorTy :
+ to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+ ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert=*/true,
+ /*Extract=*/false, Ctx.CostKind);
+ }
+ }
+ // Skip operands that do not require extraction/scalarization and do not
+ // incur any overhead.
+ SmallPtrSet<const VPValue *, 4> UniqueOperands;
+ Tys.clear();
+ for (auto *Op : ArgOps) {
+ if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ !UniqueOperands.insert(Op).second)
+ continue;
+ Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+ }
+ ScalarizationCost +=
+ Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+ }
+
+ return ScalarCallCost * VF.getFixedValue() + ScalarizationCost;
}
case Instruction::Add:
case Instruction::Sub:
@@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
(isSingleScalar() ? 1 : VF.getFixedValue());
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (isSingleScalar()) {
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+ return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+ }
+ // TODO: See getMemInstScalarizationCost for how to handle replicating and
+ // predicated cases.
+ break;
}
}
@@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- Type *PtrTy = toVectorTy(Ptr->getType(), VF);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
+
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ Type *PtrTy = Ptr->getType();
+
+ // If the address value is uniform across all lanes, then the address can be
+ // calculated with scalar type and broadcast.
+ if (!vputils::isSingleScalar(getAddr()))
+ PtrTy = toVectorTy(PtrTy, VF);
+
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
Ctx.CostKind) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
@@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
- assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+ assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
"Masking gaps for scalable vectors is not yet supported.");
- const InterleaveGroup<Instruction> *Group = IG;
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
@@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
+ if (needsMaskForGaps()) {
MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
@@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
- assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
@@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Lane && "Interleave group being replicated.");
+ assert(State.VF.isScalable() &&
+ "Only support scalable VF for EVL tail-folding.");
+ assert(!needsMaskForGaps() &&
+ "Masking gaps for scalable vectors is not yet supported.");
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
+ Instruction *Instr = Group->getInsertPos();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getLoadStoreType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ assert(InterleaveFactor <= 8 &&
+ "Unsupported deinterleave/interleave factor for scalable vectors");
+ ElementCount WideVF = State.VF * InterleaveFactor;
+ auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+ VPValue *Addr = getAddr();
+ Value *ResAddr = State.get(Addr, VPLane(0));
+ Value *EVL = State.get(getEVL(), VPLane(0));
+ Value *InterleaveEVL = State.Builder.CreateMul(
+ EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
+ /* NUW= */ true, /* NSW= */ true);
+ LLVMContext &Ctx = State.Builder.getContext();
+
+ Value *GroupMask = nullptr;
+ if (VPValue *BlockInMask = getMask()) {
+ SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
+ GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ } else {
+ GroupMask =
+ State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+ }
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(
+ VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
+ "wide.vp.load");
+ NewLoad->addParamAttr(0,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewLoad);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewLoad);
+
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ NewLoad = State.Builder.CreateIntrinsic(
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+ // If this member has different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+ StridedVec =
+ createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+ }
+
+ State.set(getVPValue(J), StridedVec);
+ ++J;
+ }
+ return;
+ } // End for interleaved load.
+
+ // The sub vector type for current instruction.
+ auto *SubVT = VectorType::get(ScalarTy, State.VF);
+ // Vectorize the interleaved store group.
+ ArrayRef<VPValue *> StoredValues = getStoredValues();
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member) {
+ StoredVecs.push_back(PoisonValue::get(SubVT));
+ continue;
+ }
+
+ Value *StoredVec = State.get(StoredValues[StoredIdx]);
+ // If this member has different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ ++StoredIdx;
+ }
+
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+ CallInst *NewStore =
+ State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, InterleaveEVL});
+ NewStore->addParamAttr(1,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewStore);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewStore);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (VPValue *Mask = getMask()) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " vp.store ";
+ getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = vp.load from index " << i;
+ }
+ ++OpIdx;
+ }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
// Find the VPValue index of the interleave group. We need to skip gaps.
unsigned InsertPosIdx = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e0bf241c73fd..2cac5557daee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
@@ -39,6 +40,10 @@
using namespace llvm;
using namespace VPlanPatternMatch;
+cl::opt<bool> EnableWideActiveLaneMask(
+ "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+ cl::desc("Enable use of wide get active lane mask instructions"));
+
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
@@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
for (VPValue *Op : Recipe.operands())
if (auto *Def =
dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- WorkList.insert(std::make_pair(VPBB, Def));
+ WorkList.insert({VPBB, Def});
}
}
@@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
for (VPValue *Op : SinkCandidate->operands())
if (auto *Def =
dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- WorkList.insert(std::make_pair(SinkTo, Def));
+ WorkList.insert({SinkTo, Def});
Changed = true;
}
return Changed;
@@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
auto *BlockInMask = PredRecipe->getMask();
auto *MaskDef = BlockInMask->getDefiningRecipe();
auto *BOMRecipe = new VPBranchOnMaskRecipe(
- BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
+ BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
auto *Entry =
Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
@@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser(
Type *StepTy = TypeInfo.inferScalarType(Step);
auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
return B.createPtrAdd(EndValue,
- B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
- "ind.escape");
+ B.createNaryOp(Instruction::Sub, {Zero, Step}),
+ DebugLoc::getUnknown(), "ind.escape");
}
if (ScalarTy->isFloatingPointTy()) {
const auto &ID = WideIV->getInductionDescriptor();
@@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
if (!ExpR)
continue;
- auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
- if (I.second)
+ const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
+ if (Inserted)
continue;
- ExpR->replaceAllUsesWith(I.first->second);
+ ExpR->replaceAllUsesWith(V->second);
ExpR->eraseFromParent();
}
}
@@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
// && (Y || Z) and (X || !X) into true. This requires queuing newly created
// recipes to be visited during simplification.
- VPValue *X, *Y;
+ VPValue *X, *Y, *Z;
if (match(Def,
m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
@@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- // OR x, 1 -> 1.
- if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
- Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
- : Def->getOperand(0));
- Def->eraseFromParent();
- return;
- }
+ // x | 1 -> 1
+ if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
+ return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+ // x | 0 -> x
+ if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
+ return Def->replaceAllUsesWith(X);
+
+ // x & 0 -> 0
+ if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
+ return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+ // x && false -> false
+ if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
+ return Def->replaceAllUsesWith(Def->getOperand(1));
+
+ // (x && y) || (x && z) -> x && (y || z)
+ VPBuilder Builder(Def);
+ if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
+ // Simplify only if one of the operands has one use to avoid creating an
+ // extra recipe.
+ (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
+ !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
+ return Def->replaceAllUsesWith(
+ Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
+
+ // x && !x -> 0
+ if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
+ return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
+ ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def))));
if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
return Def->replaceAllUsesWith(X);
@@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
+ // tail folding it is likely that x is a header mask and can be simplified
+ // further.
+ if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_VPValue(Z))) &&
+ X->hasMoreThanOneUniqueUser())
+ return Def->replaceAllUsesWith(
+ Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
+
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
@@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
m_VPValue(X), m_SpecificInt(1)))) {
Type *WideStepTy = TypeInfo.inferScalarType(Def);
if (TypeInfo.inferScalarType(X) != WideStepTy)
- X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
+ X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
Def->replaceAllUsesWith(X);
return;
}
@@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- VPInstruction *OpVPI;
- if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&
- OpVPI->isVectorToScalar()) {
- Def->replaceAllUsesWith(OpVPI);
- return;
+ if (match(Def,
+ m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
+ vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
+ return U->usesScalars(A) || Def == U;
+ })) {
+ return Def->replaceAllUsesWith(A);
}
}
@@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
continue;
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+ if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+ vputils::isSingleScalar(RepR->getOperand(1))) {
+ auto *Clone = new VPReplicateRecipe(
+ RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+ true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+ Clone->insertBefore(RepOrWidenR);
+ auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
+ {Clone->getOperand(0)});
+ Ext->insertBefore(Clone);
+ Clone->setOperand(0, Ext);
+ RepR->eraseFromParent();
+ continue;
+ }
+
// Skip recipes that aren't single scalars or don't have only their
// scalar results used. In the latter case, we would introduce extra
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
- !vputils::onlyScalarValuesUsed(RepOrWidenR))
+ !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
+ return U->usesScalars(RepOrWidenR) ||
+ match(cast<VPRecipeBase>(U),
+ m_ExtractLastElement(m_VPValue()));
+ }))
continue;
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
@@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
}
}
+/// Try to see if all of \p Blend's masks share a common value logically and'ed
+/// and remove it from the masks.
+static void removeCommonBlendMask(VPBlendRecipe *Blend) {
+ if (Blend->isNormalized())
+ return;
+ VPValue *CommonEdgeMask;
+ if (!match(Blend->getMask(0),
+ m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
+ return;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+ if (!match(Blend->getMask(I),
+ m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
+ return;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+ Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
+}
+
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
@@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) {
if (!Blend)
continue;
+ removeCommonBlendMask(Blend);
+
// Try to remove redundant blend recipes.
SmallPtrSet<VPValue *, 4> UniqueValues;
if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
@@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
}
+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has it's last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+ unsigned UF) {
+ if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+ return false;
+
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+ auto *Term = &ExitingVPBB->back();
+
+ using namespace llvm::VPlanPatternMatch;
+ if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+ m_VPValue(), m_VPValue(), m_VPValue())))))
+ return false;
+
+ auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+ LLVMContext &Ctx = Plan.getContext();
+
+ auto ExtractFromALM = [&](VPInstruction *ALM,
+ SmallVectorImpl<VPValue *> &Extracts) {
+ DebugLoc DL = ALM->getDebugLoc();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<VPValue *> Ops;
+ Ops.append({ALM, Plan.getOrAddLiveIn(
+ ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+ VF.getKnownMinValue() * Part))});
+ auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+ IntegerType::getInt1Ty(Ctx), DL);
+ Extracts[Part] = Ext;
+ Ext->insertAfter(ALM);
+ }
+ };
+
+ // Create a list of each active lane mask phi, ordered by unroll part.
+ SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+ for (VPRecipeBase &R : Header->phis()) {
+ auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+ if (!Phi)
+ continue;
+ VPValue *Index = nullptr;
+ match(Phi->getBackedgeValue(),
+ m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+ assert(Index && "Expected index from ActiveLaneMask instruction");
+
+ auto *II = dyn_cast<VPInstruction>(Index);
+ if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+ auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+ Phis[Part->getZExtValue()] = Phi;
+ } else
+ // Anything other than a CanonicalIVIncrementForPart is part 0
+ Phis[0] = Phi;
+ }
+
+ assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+ "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+ auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+ auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+ assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+ LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+ "Expected incoming values of Phi to be ActiveLaneMasks");
+
+ // When using wide lane masks, the return type of the get.active.lane.mask
+ // intrinsic is VF x UF (last operand).
+ VPValue *ALMMultiplier =
+ Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ EntryALM->setOperand(2, ALMMultiplier);
+ LoopALM->setOperand(2, ALMMultiplier);
+
+ // Create UF x extract vectors and insert into preheader.
+ SmallVector<VPValue *> EntryExtracts(UF);
+ ExtractFromALM(EntryALM, EntryExtracts);
+
+ // Create UF x extract vectors and insert before the loop compare & branch,
+ // updating the compare to use the first extract.
+ SmallVector<VPValue *> LoopExtracts(UF);
+ ExtractFromALM(LoopALM, LoopExtracts);
+ VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+ Not->setOperand(0, LoopExtracts[0]);
+
+ // Update the incoming values of active lane mask phis.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Phis[Part]->setStartValue(EntryExtracts[Part]);
+ Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+ }
+
+ return true;
+}
+
/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
- match(Term, m_BranchOnCond(
- m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+ match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+ m_VPValue(), m_VPValue(), m_VPValue()))))) {
// Try to simplify the branch condition if TC <= VF * UF when the latch
// terminator is BranchOnCount or BranchOnCond where the input is
// Not(ActiveLaneMask).
@@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
- bool MadeChange =
- simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+ bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
+ MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
if (MadeChange) {
@@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
+namespace {
+struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
+ static bool isSentinel(const VPSingleDefRecipe *Def) {
+ return Def == getEmptyKey() || Def == getTombstoneKey();
+ }
+
+ /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
+ /// Returns an optional pair, where the first element indicates whether it is
+ /// an intrinsic ID.
+ static std::optional<std::pair<bool, unsigned>>
+ getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
+ return TypeSwitch<const VPSingleDefRecipe *,
+ std::optional<std::pair<bool, unsigned>>>(R)
+ .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPWidenSelectRecipe, VPReplicateRecipe>(
+ [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+ .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+ return std::make_pair(true, I->getVectorIntrinsicID());
+ })
+ .Default([](auto *) { return std::nullopt; });
+ }
+
+ /// Returns true if recipe \p Def can be safely handed for CSE.
+ static bool canHandle(const VPSingleDefRecipe *Def) {
+ // We can extend the list of handled recipes in the future,
+ // provided we account for the data embedded in them while checking for
+ // equality or hashing.
+ auto C = getOpcodeOrIntrinsicID(Def);
+
+ // The issue with (Insert|Extract)Value is that the index of the
+ // insert/extract is not a proper operand in LLVM IR, and hence also not in
+ // VPlan.
+ if (!C || (!C->first && (C->second == Instruction::InsertValue ||
+ C->second == Instruction::ExtractValue)))
+ return false;
+
+ // During CSE, we can only handle recipes that don't read from memory: if
+ // they read from memory, there could be an intervening write to memory
+ // before the next instance is CSE'd, leading to an incorrect result.
+ return !Def->mayReadFromMemory();
+ }
+
+ /// Hash the underlying data of \p Def.
+ static unsigned getHashValue(const VPSingleDefRecipe *Def) {
+ const VPlan *Plan = Def->getParent()->getPlan();
+ VPTypeAnalysis TypeInfo(*Plan);
+ hash_code Result = hash_combine(
+ Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
+ TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
+ hash_combine_range(Def->operands()));
+ if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
+ if (RFlags->hasPredicate())
+ return hash_combine(Result, RFlags->getPredicate());
+ return Result;
+ }
+
+ /// Check equality of underlying data of \p L and \p R.
+ static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
+ if (isSentinel(L) || isSentinel(R))
+ return L == R;
+ if (L->getVPDefID() != R->getVPDefID() ||
+ getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
+ vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
+ !equal(L->operands(), R->operands()))
+ return false;
+ if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
+ if (LFlags->hasPredicate() &&
+ LFlags->getPredicate() !=
+ cast<VPRecipeWithIRFlags>(R)->getPredicate())
+ return false;
+ const VPlan *Plan = L->getParent()->getPlan();
+ VPTypeAnalysis TypeInfo(*Plan);
+ return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
+ }
+};
+} // end anonymous namespace
+
+/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
+/// Plan.
+void VPlanTransforms::cse(VPlan &Plan) {
+ VPDominatorTree VPDT(Plan);
+ DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+ if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
+ continue;
+ if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
+ // V must dominate Def for a valid replacement.
+ if (!VPDT.dominates(V->getParent(), VPBB))
+ continue;
+ // Only keep flags present on both V and Def.
+ if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
+ RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
+ Def->replaceAllUsesWith(V);
+ continue;
+ }
+ CSEMap[Def] = Def;
+ }
+ }
+}
+
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
runPass(removeRedundantInductionCasts, Plan);
runPass(simplifyRecipes, Plan);
- runPass(simplifyBlends, Plan);
runPass(removeDeadRecipes, Plan);
- runPass(narrowToSingleScalarRecipes, Plan);
+ runPass(simplifyBlends, Plan);
runPass(legalizeAndOptimizeInductions, Plan);
+ runPass(narrowToSingleScalarRecipes, Plan);
runPass(removeRedundantExpandSCEVRecipes, Plan);
runPass(simplifyRecipes, Plan);
runPass(removeBranchOnConst, Plan);
@@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- auto *EntryALM =
- Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
- DL, "active.lane.mask.entry");
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {EntryIncrement, TC, ALMMultiplier}, DL,
+ "active.lane.mask.entry");
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ auto *LaneMaskPhi =
+ new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
LaneMaskPhi->insertAfter(CanonicalIVPHI);
// Create the active lane mask for the next iteration of the loop before the
@@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
{IncrementValue}, {false, false}, DL);
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
- {InLoopIncrement, TripCount}, DL,
- "active.lane.mask.next");
+ {InLoopIncrement, TripCount, ALMMultiplier},
+ DL, "active.lane.mask.next");
LaneMaskPhi->addOperand(ALM);
// Replace the original terminator with BranchOnCond. We have to invert the
@@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
/// for the header-mask pattern manually.
static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
SmallVector<VPValue *> WideCanonicalIVs;
- auto *FoundWidenCanonicalIVUser =
- find_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+ IsaPred<VPWidenCanonicalIVRecipe>);
assert(count_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
- 1 &&
+ IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
"Must have at most one VPWideCanonicalIVRecipe");
if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
auto *WideCanonicalIV =
@@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask(
"DataAndControlFlowWithoutRuntimeCheck implies "
"UseActiveLaneMaskForControlFlow");
- auto *FoundWidenCanonicalIVUser =
- find_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+ IsaPred<VPWidenCanonicalIVRecipe>);
assert(FoundWidenCanonicalIVUser &&
"Must have widened canonical IV when tail folding!");
VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
@@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask(
Plan, DataAndControlFlowWithoutRuntimeCheck);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
- LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
- {WideCanonicalIV, Plan.getTripCount()}, nullptr,
- "active.lane.mask");
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ LaneMask =
+ B.createNaryOp(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+ nullptr, "active.lane.mask");
}
// Walk users of WideCanonicalIV and replace the header mask of the form
@@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewAddr = GetNewAddr(S->getAddr());
return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
+ .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+ VPValue *NewMask = GetNewMask(IR->getMask());
+ return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
+ })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPBuilder Builder(LoopRegion->getPreheaderVPBB());
MaxEVL = Builder.createScalarZExtOrTrunc(
MaxEVL, Type::getInt32Ty(Plan.getContext()),
- TypeInfo.inferScalarType(MaxEVL), DebugLoc());
+ TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- VPValue *PrevEVL =
- Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
+ VPValue *PrevEVL = Builder.createScalarPhi(
+ {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
@@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+ unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
- assert(NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
+ EVLRecipe)) {
+ for (unsigned I = 0; I < NumDefVal; ++I) {
+ VPValue *CurVPV = CurRecipe->getVPValue(I);
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
+ }
}
ToErase.push_back(CurRecipe);
}
@@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength(
VPValue *StartV = CanonicalIVPHI->getStartValue();
// Create the ExplicitVectorLengthPhi recipe in the main loop.
- auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+ auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
EVLPhi->insertAfter(CanonicalIVPHI);
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Create the AVL (application vector length), starting from TC -> 0 in steps
@@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength(
VPValue *AVLSafe =
Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
- AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
+ AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
+ "safe_avl");
}
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
- DebugLoc());
+ DebugLoc::getUnknown());
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
@@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+ VPValue *AVL;
+ [[maybe_unused]] bool FoundAVL =
+ match(EVLIncrement,
+ m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
+ assert(FoundAVL && "Didn't find AVL?");
+
+ // The AVL may be capped to a safe distance.
+ VPValue *SafeAVL;
+ if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
+ AVL = SafeAVL;
+
+ VPValue *AVLNext;
+ [[maybe_unused]] bool FoundAVLNext =
+ match(AVL, m_VPInstruction<Instruction::PHI>(
+ m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
+ assert(FoundAVLNext && "Didn't find AVL backedge?");
// Convert EVLPhi to concrete recipe.
auto *ScalarR =
@@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Replace the use of VectorTripCount in the latch-exiting block.
// Before: (branch-on-count EVLIVInc, VectorTripCount)
- // After: (branch-on-count EVLIVInc, TripCount)
+ // After: (branch-on-cond eq AVLNext, 0)
VPBasicBlock *LatchExiting =
HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
@@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
m_BranchOnCount(m_VPValue(EVLIncrement),
m_Specific(&Plan.getVectorTripCount()))) &&
"Unexpected terminator in EVL loop");
- LatchExitingBr->setOperand(1, Plan.getTripCount());
+
+ Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
+ VPBuilder Builder(LatchExitingBr);
+ VPValue *Cmp =
+ Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
+ LatchExitingBr->eraseFromParent();
+}
+
+void VPlanTransforms::replaceSymbolicStrides(
+ VPlan &Plan, PredicatedScalarEvolution &PSE,
+ const DenseMap<Value *, const SCEV *> &StridesMap) {
+ // Replace VPValues for known constant strides guaranteed by predicate scalar
+ // evolution.
+ auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
+ auto *R = cast<VPRecipeBase>(&U);
+ return R->getParent()->getParent() ||
+ R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
+ };
+ for (const SCEV *Stride : StridesMap.values()) {
+ using namespace SCEVPatternMatch;
+ auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
+ const APInt *StrideConst;
+ if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
+ // Only handle constant strides for now.
+ continue;
+
+ auto *CI =
+ Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
+ StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+
+ // The versioned value may not be used in the loop directly but through a
+ // sext/zext. Add new live-ins in those cases.
+ for (Value *U : StrideV->users()) {
+ if (!isa<SExtInst, ZExtInst>(U))
+ continue;
+ VPValue *StrideVPV = Plan.getLiveIn(U);
+ if (!StrideVPV)
+ continue;
+ unsigned BW = U->getType()->getScalarSizeInBits();
+ APInt C =
+ isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
+ VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+ }
+ }
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
- Init =
- Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
+ Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
+ DebugLoc::getUnknown(), "induction");
// Create the widened phi of the vector IV.
auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
@@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
R->eraseFromParent();
}
-void VPlanTransforms::handleUncountableEarlyExit(
- VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
- VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
+void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
+ VPBasicBlock *EarlyExitVPBB,
+ VPlan &Plan,
+ VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *LatchVPBB) {
VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
if (!EarlyExitVPBB->getSinglePredecessor() &&
EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
- auto IsVector = [](ElementCount VF) { return VF.isVector(); };
- // When the VFs are vectors, need to add `extract` to get the incoming value
- // from early exit. When the range contains scalar VF, limit the range to
- // scalar VF to prevent mis-compilation for the range containing both scalar
- // and vector VFs.
- if (!IncomingFromEarlyExit->isLiveIn() &&
- LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
+ if (!IncomingFromEarlyExit->isLiveIn()) {
// Update the incoming value from the early exit.
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
@@ -3125,7 +3469,7 @@ static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
- if (Opcode != Instruction::Add)
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return nullptr;
Type *RedTy = Ctx.Types.inferScalarType(Red);
@@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Type *SrcTy =
Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
- InstructionCost MulAccCost =
- Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+ isZExt, Opcode, RedTy, SrcVecTy, CostKind);
InstructionCost MulCost = Mul->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
InstructionCost ExtCost = 0;
@@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
Plan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
+ assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
+ "VPExpandSCEVRecipes must be at the beginning of the entry block, "
+ "after any VPIRInstructions");
+ // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
+ // to the VPIRBasicBlock.
+ auto EI = Entry->begin();
+ for (Instruction &I : drop_end(*EntryBB)) {
+ if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
+ &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
+ EI++;
+ continue;
+ }
+ VPIRInstruction::create(I)->insertBefore(*Entry, EI);
+ }
+
return ExpandedSCEVs;
}
@@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) {
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
- if (VF.isScalable() || !VectorLoop)
+ if (!VectorLoop)
return;
VPTypeAnalysis TypeInfo(Plan);
- unsigned FixedVF = VF.getFixedValue();
+ unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
@@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
continue;
// Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
+ if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
VectorRegWidth))
return;
@@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
return;
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
- auto NarrowOp = [](VPValue *V) -> VPValue * {
+ SmallPtrSet<VPValue *, 4> NarrowedOps;
+ auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
auto *R = V->getDefiningRecipe();
- if (!R)
+ if (!R || NarrowedOps.contains(V))
return V;
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
// Narrow interleave group to wide load, as transformed VPlan will only
@@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
/*Reverse=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
+ NarrowedOps.insert(L);
return L;
}
@@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
assert(RepR->isSingleScalar() &&
isa<LoadInst>(RepR->getUnderlyingInstr()) &&
"must be a single scalar load");
+ NarrowedOps.insert(RepR);
return RepR;
}
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
/*IsUniform*/ true,
/*Mask*/ nullptr, *WideLoad);
N->insertBefore(WideLoad);
+ NarrowedOps.insert(N);
return N;
};
@@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
- Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
- CanIV->getScalarType(), 1 * Plan.getUF())));
- Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ VPBuilder PHBuilder(Plan.getVectorPreheader());
+
+ VPValue *UF = Plan.getOrAddLiveIn(
+ ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ if (VF.isScalable()) {
+ VPValue *VScale = PHBuilder.createElementCount(
+ CanIV->getScalarType(), ElementCount::getScalable(1));
+ VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
+ Inc->setOperand(1, VScaleUF);
+ Plan.getVF().replaceAllUsesWith(VScale);
+ } else {
+ Inc->setOperand(1, UF);
+ Plan.getVF().replaceAllUsesWith(
+ Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ }
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 700b94621d5f..1957428fab79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -62,16 +62,47 @@ struct VPlanTransforms {
/// The created loop is wrapped in an initial skeleton to facilitate
/// vectorization, consisting of a vector pre-header, an exit block for the
/// main vector loop (middle.block) and a new block as preheader of the scalar
- /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p
- /// InductionTy and \p IVDL, and creates a VPValue expression for the original
- /// trip count.
+ /// loop (scalar.ph). See below for an illustration. It also adds a canonical
+ /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
+ /// VPValue expression for the original trip count.
+ ///
+ /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
+ /// / \ old preheader. Will contain iteration number check and SCEV
+ /// | | expansions.
+ /// | |
+ /// / v
+ /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be
+ /// | / | added later.
+ /// | / v
+ /// || [ ] <-- vector pre header.
+ /// |/ |
+ /// | v
+ /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized.
+ /// | [ ]_|
+ /// | |
+ /// | v
+ /// | [ ] <--- middle-block with the branch to successors
+ /// | / |
+ /// | / |
+ /// | | v
+ /// \--->[ ] <--- scalar preheader (initial a VPBasicBlock, which will be
+ /// | | replaced later by a VPIRBasicBlock wrapping the scalar
+ /// | | preheader basic block.
+ /// | |
+ /// v <-- edge from middle to exit iff epilogue is not required.
+ /// | [ ] \
+ /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
+ /// | | header wrapped in VPIRBasicBlock).
+ /// \ |
+ /// \ v
+ /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
PredicatedScalarEvolution &PSE);
/// Update \p Plan to account for all early exits.
- LLVM_ABI_FOR_TEST static void
- handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
+ LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
+ bool HasUncountableExit);
/// If a check is needed to guard executing the scalar epilogue loop, it will
/// be added to the middle block.
@@ -79,6 +110,13 @@ struct VPlanTransforms {
bool RequiresScalarEpilogueCheck,
bool TailFolded);
+ // Create a check to \p Plan to see if the vector loop should be executed.
+ static void addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);
+
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
/// flat CFG into a hierarchical CFG.
LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);
@@ -161,6 +199,12 @@ struct VPlanTransforms {
truncateToMinimalBitwidths(VPlan &Plan,
const MapVector<Instruction *, uint64_t> &MinBWs);
+ /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
+ /// possible.
+ static void
+ replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
+ const DenseMap<Value *, const SCEV *> &StridesMap);
+
/// Drop poison flags from recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
/// recipes meet the following conditions:
@@ -207,8 +251,7 @@ struct VPlanTransforms {
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPBasicBlock *EarlyExitVPBB,
VPlan &Plan, VPBasicBlock *HeaderVPBB,
- VPBasicBlock *LatchVPBB,
- VFRange &Range);
+ VPBasicBlock *LatchVPBB);
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
@@ -220,9 +263,10 @@ struct VPlanTransforms {
/// variable vector lengths instead of fixed lengths. This transformation:
/// * Makes EVL-Phi concrete.
// * Removes CanonicalIV and increment.
- /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
- /// VectorTripCount) with variable-length stepping (branch-on-cond
- /// EVLIVInc, TripCount).
+ /// * Replaces the exit condition from
+ /// (branch-on-count CanonicalIVInc, VectorTripCount)
+ /// to
+ /// (branch-on-cond eq AVLNext, 0)
static void canonicalizeEVLLoops(VPlan &Plan);
/// Lower abstract recipes to concrete ones, that can be codegen'd.
@@ -242,6 +286,9 @@ struct VPlanTransforms {
/// removing dead edges to their successors.
static void removeBranchOnConst(VPlan &Plan);
+ /// Perform common-subexpression-elimination on \p Plan.
+ static void cse(VPlan &Plan);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 4bcde8cd5d42..443df167378b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -92,18 +92,18 @@ public:
void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
unsigned Part) {
for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
- auto Ins = VPV2Parts.insert({VPV, {}});
- assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
- Ins.first->second.push_back(CopyR->getVPValue(Idx));
+ const auto &[V, _] = VPV2Parts.try_emplace(VPV);
+ assert(V->second.size() == Part - 1 && "earlier parts not set");
+ V->second.push_back(CopyR->getVPValue(Idx));
}
}
/// Given a uniform recipe \p R, add it for all parts.
void addUniformForAllParts(VPSingleDefRecipe *R) {
- auto Ins = VPV2Parts.insert({R, {}});
- assert(Ins.second && "uniform value already added");
+ const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
+ assert(Inserted && "uniform value already added");
for (unsigned Part = 0; Part != UF; ++Part)
- Ins.first->second.push_back(R);
+ V->second.push_back(R);
}
bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
@@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
VPBuilder Builder(RepR);
if (RepR->getNumUsers() == 0) {
- if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
- vputils::isSingleScalar(RepR->getOperand(1))) {
- // Stores to invariant addresses need to store the last lane only.
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
- Def2LaneDefs);
- } else {
- // Create single-scalar version of RepR for all lanes.
- for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
- }
+ // Create single-scalar version of RepR for all lanes.
+ for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
RepR->eraseFromParent();
continue;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 700a733bf9f2..c6c1ef336982 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
VPValue *A, *B;
using namespace VPlanPatternMatch;
- if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+ if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
return B == Plan.getTripCount() &&
(match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
m_SpecificInt(1),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d..77c099b27171 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) {
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
+ case VPInstruction::Not:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef..85c6c2c8d796 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,7 +38,7 @@ struct VPDoubleValueDef;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
-class VPInterleaveRecipe;
+class VPInterleaveBase;
class VPPhiAccessors;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -48,7 +48,7 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
- friend class VPInterleaveRecipe;
+ friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -335,6 +335,7 @@ public:
VPExpressionSC,
VPIRInstructionSC,
VPInstructionSC,
+ VPInterleaveEVLSC,
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418..99f3bc367a54 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+ VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
@@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) {
const VPBlockBase *Exiting = Region->getExiting();
// Entry and Exiting shouldn't have any predecessor/successor, respectively.
- if (Entry->getNumPredecessors() != 0) {
+ if (Entry->hasPredecessors()) {
errs() << "region entry block has predecessors\n";
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 092a3a87954f..17cb18a22336 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -99,6 +99,10 @@ private:
InstructionWorklist Worklist;
+ /// Next instruction to iterate. It will be updated when it is erased by
+ /// RecursivelyDeleteTriviallyDeadInstructions.
+ Instruction *NextInst;
+
// TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
// parameter. That should be updated to specific sub-classes because the
// run loop was changed to dispatch on opcode.
@@ -118,6 +122,7 @@ private:
bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
bool foldBitOpOfCastops(Instruction &I);
+ bool foldBitOpOfCastConstant(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
@@ -169,13 +174,16 @@ private:
// further folds that were hindered by OneUse limits.
SmallPtrSet<Value *, 4> Visited;
for (Value *Op : Ops) {
- if (Visited.insert(Op).second) {
+ if (!Visited.contains(Op)) {
if (auto *OpI = dyn_cast<Instruction>(Op)) {
if (RecursivelyDeleteTriviallyDeadInstructions(
- OpI, nullptr, nullptr, [this](Value *V) {
- if (auto I = dyn_cast<Instruction>(V)) {
+ OpI, nullptr, nullptr, [&](Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
Worklist.remove(I);
+ if (I == NextInst)
+ NextInst = NextInst->getNextNode();
+ Visited.insert(I);
}
}))
continue;
@@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
if (LHSSrc->getType() != RHSSrc->getType())
return false;
- // Only handle vector types with integer elements
- auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
- auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!SrcVecTy || !DstVecTy)
+ auto *SrcTy = LHSSrc->getType();
+ auto *DstTy = I.getType();
+ // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
+ // Other casts only handle vector types with integer elements.
+ if (CastOpcode != Instruction::BitCast &&
+ (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
return false;
- if (!SrcVecTy->getScalarType()->isIntegerTy() ||
- !DstVecTy->getScalarType()->isIntegerTy())
+ // Only integer scalar/vector values are legal for bitwise logic operations.
+ if (!SrcTy->getScalarType()->isIntegerTy() ||
+ !DstTy->getScalarType()->isIntegerTy())
return false;
// Cost Check :
@@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
// NewCost = bitlogic + cast
// Calculate specific costs for each cast with instruction context
- InstructionCost LHSCastCost =
- TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None, CostKind, LHSCast);
- InstructionCost RHSCastCost =
- TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None, CostKind, RHSCast);
+ InstructionCost LHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+ InstructionCost RHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
LHSCastCost + RHSCastCost;
// For new cost, we can't provide an instruction (it doesn't exist yet)
InstructionCost GenericCastCost = TTI.getCastInstrCost(
- CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
GenericCastCost;
// Account for multi-use casts using specific costs
@@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
return true;
}
+/// Match:
+// bitop(castop(x), C) ->
+// bitop(castop(x), castop(InvC)) ->
+// castop(bitop(x, InvC))
+// Supports: bitcast
+bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
+ Instruction *LHS;
+ Constant *C;
+
+ // Check if this is a bitwise logic operation
+ if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C))))
+ return false;
+
+ // Get the cast instructions
+ auto *LHSCast = dyn_cast<CastInst>(LHS);
+ if (!LHSCast)
+ return false;
+
+ Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+
+ // Only handle supported cast operations
+ switch (CastOpcode) {
+ case Instruction::BitCast:
+ break;
+ default:
+ return false;
+ }
+
+ Value *LHSSrc = LHSCast->getOperand(0);
+
+ auto *SrcTy = LHSSrc->getType();
+ auto *DstTy = I.getType();
+ // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
+ // Other casts only handle vector types with integer elements.
+ if (CastOpcode != Instruction::BitCast &&
+ (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
+ return false;
+
+ // Only integer scalar/vector values are legal for bitwise logic operations.
+ if (!SrcTy->getScalarType()->isIntegerTy() ||
+ !DstTy->getScalarType()->isIntegerTy())
+ return false;
+
+ // Find the constant InvC, such that castop(InvC) equals to C.
+ PreservedCastFlags RHSFlags;
+ Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
+ if (!InvC)
+ return false;
+
+ // Cost Check :
+ // OldCost = bitlogic + cast
+ // NewCost = bitlogic + cast
+
+ // Calculate specific costs for each cast with instruction context
+ InstructionCost LHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
+
+ // For new cost, we can't provide an instruction (it doesn't exist yet)
+ InstructionCost GenericCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
+
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
+ GenericCastCost;
+
+ // Account for multi-use casts using specific costs
+ if (!LHSCast->hasOneUse())
+ NewCost += LHSCastCost;
+
+ LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
+ << " NewCost=" << NewCost << "\n");
+
+ if (NewCost > OldCost)
+ return false;
+
+ // Create the operation on the source type
+ Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
+ LHSSrc, InvC, I.getName() + ".inner");
+ if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
+ NewBinOp->copyIRFlags(&I);
+
+ Worklist.pushValue(NewOp);
+
+ // Create the cast operation directly to ensure we get a new instruction
+ Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+ // Insert the new instruction
+ Value *Result = Builder.Insert(NewCast);
+
+ replaceValue(I, *Result);
+ return true;
+}
+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
TTI::CastContextHint::None, CostKind, RedOp);
CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
- CostAfterReduction =
- TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+ CostAfterReduction = TTI.getMulAccReductionCost(
+ IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
return;
}
CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
@@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
unsigned MaxVectorSize =
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+ if (MaxElementsInVector == 0)
+ return false;
// When there are multiple shufflevector operations on the same input,
// especially when the vector length is larger than the register size,
// identical shuffle patterns may occur across different groups of elements.
@@ -4467,6 +4574,8 @@ bool VectorCombine::run() {
case Instruction::Xor:
if (foldBitOpOfCastops(I))
return true;
+ if (foldBitOpOfCastConstant(I))
+ return true;
break;
case Instruction::PHI:
if (shrinkPhiOfShuffles(I))
@@ -4519,13 +4628,21 @@ bool VectorCombine::run() {
if (!DT.isReachableFromEntry(&BB))
continue;
// Use early increment range so that we can erase instructions in loop.
- for (Instruction &I : make_early_inc_range(BB)) {
- if (I.isDebugOrPseudoInst())
- continue;
- MadeChange |= FoldInst(I);
+ // make_early_inc_range is not applicable here, as the next iterator may
+ // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
+ // We manually maintain the next instruction and update it when it is about
+ // to be deleted.
+ Instruction *I = &BB.front();
+ while (I) {
+ NextInst = I->getNextNode();
+ if (!I->isDebugOrPseudoInst())
+ MadeChange |= FoldInst(*I);
+ I = NextInst;
}
}
+ NextInst = nullptr;
+
while (!Worklist.isEmpty()) {
Instruction *I = Worklist.removeOne();
if (!I)