summary | refs | log | tree | commit | diff
path: root/llvm/lib/Transforms/Vectorize
diff options
context:
space:
mode:
author Michael Kruse <llvm-project@meinersbur.de> 2025-01-03 10:22:51 +0100
committer Michael Kruse <llvm-project@meinersbur.de> 2025-01-03 10:22:51 +0100
commit 38500d63e14ce340236840f60d356cdefb56a52c (patch)
tree 17edbec446ce9b50d2f215a483b83afb293a635d /llvm/lib/Transforms/Vectorize
parent 1a3d5daaef7a6a63448a497da3eff7fc9e23df26 (diff)
parent 27f30029741ecf023baece7b3dde1ff9011ffefc (diff)
Merge branch 'main' into users/meinersbur/flang_runtime_split-headers [branch: users/meinersbur/flang_runtime_split-headers]
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp 38
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h 17
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 1046
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 880
-rw-r--r-- llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp 107
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.cpp 272
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h 390
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp 8
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp 10
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h 16
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp 231
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp 580
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.h 13
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp 650
18 files changed, 2575 insertions, 1695 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f1568781252c..cb0b4641b649 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -666,7 +666,6 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
// Check whether we are able to set up outer loop induction.
if (!setupOuterLoopInductions()) {
reportVectorizationFailure("Unsupported outer loop Phi(s)",
- "Unsupported outer loop Phi(s)",
"UnsupportedPhi", ORE, TheLoop);
if (DoExtraAnalysis)
Result = false;
@@ -927,7 +926,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
auto *SE = PSE.getSE();
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
- if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
TheLoop)) {
reportVectorizationFailure("Found unvectorizable intrinsic",
@@ -962,7 +961,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
reportVectorizationFailure("Store instruction cannot be vectorized",
- "store instruction cannot be vectorized",
"CantVectorizeStore", ORE, TheLoop, ST);
return false;
}
@@ -976,7 +974,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
reportVectorizationFailure(
"nontemporal store instruction cannot be vectorized",
- "nontemporal store instruction cannot be vectorized",
"CantVectorizeNontemporalStore", ORE, TheLoop, ST);
return false;
}
@@ -991,7 +988,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
reportVectorizationFailure(
"nontemporal load instruction cannot be vectorized",
- "nontemporal load instruction cannot be vectorized",
"CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
return false;
}
@@ -1020,7 +1016,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
reportVectorizationFailure("Value cannot be used outside the loop",
- "value cannot be used outside the loop",
"ValueUsedOutsideLoop", ORE, TheLoop, &I);
return false;
}
@@ -1375,6 +1370,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+ // When vectorizing early exits, create predicates for the latch block only.
+ // The early exiting block must be a direct predecessor of the latch at the
+ // moment.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ if (hasUncountableEarlyExit()) {
+ assert(
+ is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
+ "Uncountable exiting block must be a direct predecessor of latch");
+ return BB == Latch;
+ }
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1432,9 +1437,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion) {
reportVectorizationFailure("If-conversion is disabled",
- "if-conversion is disabled",
- "IfConversionDisabled",
- ORE, TheLoop);
+ "IfConversionDisabled", ORE, TheLoop);
return false;
}
@@ -1483,14 +1486,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (isa<SwitchInst>(BB->getTerminator())) {
if (TheLoop->isLoopExiting(BB)) {
reportVectorizationFailure("Loop contains an unsupported switch",
- "loop contains an unsupported switch",
"LoopContainsUnsupportedSwitch", ORE,
TheLoop, BB->getTerminator());
return false;
}
} else if (!isa<BranchInst>(BB->getTerminator())) {
reportVectorizationFailure("Loop contains an unsupported terminator",
- "loop contains an unsupported terminator",
"LoopContainsUnsupportedTerminator", ORE,
TheLoop, BB->getTerminator());
return false;
@@ -1500,8 +1501,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (blockNeedsPredication(BB) &&
!blockCanBePredicated(BB, SafePointers, MaskedOp)) {
reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select", "NoCFGForSelect",
+ "Control flow cannot be substituted for a select", "NoCFGForSelect",
ORE, TheLoop, BB->getTerminator());
return false;
}
@@ -1691,8 +1691,6 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
} else if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
- "Early exit loop contains operations that "
- "cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
TheLoop);
return false;
@@ -1754,9 +1752,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
if (!canVectorizeOuterLoop()) {
reportVectorizationFailure("Unsupported outer loop",
- "unsupported outer loop",
- "UnsupportedOuterLoop",
- ORE, TheLoop);
+ "UnsupportedOuterLoop", ORE, TheLoop);
// TODO: Implement DoExtraAnalysis when subsequent legal checks support
// outer loops.
return false;
@@ -1788,13 +1784,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
HasUncountableEarlyExit = false;
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+ HasUncountableEarlyExit = true;
if (!isVectorizableEarlyExitLoop()) {
+ UncountableExitingBlocks.clear();
+ HasUncountableEarlyExit = false;
if (DoExtraAnalysis)
Result = false;
else
return false;
- } else
- HasUncountableEarlyExit = true;
+ }
}
// Go over each instruction and look at memory deps.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index fbcf181a45a6..26a2de8c8097 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -222,21 +222,24 @@ public:
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
- return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+ return tryInsertInstruction(
+ new VPInstruction(Ptr, Offset, GEPNoWrapFlags::none(), DL, Name));
}
VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
- return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
+ return tryInsertInstruction(
+ new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name));
}
+ /// Convert the input value \p Current to the corresponding value of an
+ /// induction with \p Start and \p Step values, using \p Start + \p Current *
+ /// \p Step.
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV,
- VPValue *Step, const Twine &Name = "") {
+ VPValue *Current, VPValue *Step,
+ const Twine &Name = "") {
return tryInsertInstruction(
- new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name));
+ new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name));
}
VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c7c044a0427..f2f8a85b7cc2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
cl::Hidden,
cl::desc("Try wider VFs if they enable the use of vector variants"));
+static cl::opt<bool> EnableEarlyExitVectorization(
+ "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Enable vectorization of early exit loops with uncountable exits."));
+
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -474,7 +479,8 @@ public:
AC(AC), ORE(ORE), VF(VecWidth),
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -487,12 +493,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop and the start value for the canonical induction, if it is != 0. The
- /// latter is the case when vectorizing the epilogue loop. In the case of
- /// epilogue vectorization, this function is overriden to handle the more
- /// complex control flow around the loops. \p ExpandedSCEVs is used to
- /// look up SCEV expansions for expressions needed during skeleton creation.
- virtual std::pair<BasicBlock *, Value *>
+ /// loop. In the case of epilogue vectorization, this function is overridden
+ /// to handle the more complex control flow around the loops. \p ExpandedSCEVs
+ /// is used to look up SCEV expansions for expressions needed during skeleton
+ /// creation.
+ virtual BasicBlock *
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -513,18 +518,6 @@ public:
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPTransformState &State);
- /// Create a new phi node for the induction variable \p OrigPhi to resume
- /// iteration count in the scalar epilogue, from where the vectorized loop
- /// left off. \p Step is the SCEV-expanded induction step to use. In cases
- /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
- /// and the resume values can come from an additional bypass block, the \p
- /// AdditionalBypass pair provides information about the bypass block and the
- /// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
-
/// Returns the original loop trip count.
Value *getTripCount() const { return TripCount; }
@@ -533,6 +526,20 @@ public:
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ /// Retrieve the additional bypass value associated with an original
+ /// induction header phi.
+ Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
+ return Induction2AdditionalBypassValue.at(OrigPhi);
+ }
+
+ /// Return the additional bypass block which targets the scalar loop by
+ /// skipping the epilogue loop after completing the main loop.
+ BasicBlock *getAdditionalBypassBlock() const {
+ assert(AdditionalBypassBlock &&
+ "Trying to access AdditionalBypassBlock but it has not been set");
+ return AdditionalBypassBlock;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -566,21 +573,21 @@ protected:
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);
- /// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off.
- /// In cases where the loop skeleton is more complicated (eg. epilogue
- /// vectorization) and the resume values can come from an additional bypass
- /// block, the \p AdditionalBypass pair provides information about the bypass
- /// block and the end value on the edge from bypass to this loop.
- void createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// Create and record the values for induction variables to resume coming from
+ /// the additional bypass block.
+ void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart() {}
virtual void printDebugTracesAtEnd() {}
+ /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+ /// vector preheader and its predecessor, also connecting the new block to the
+ /// scalar preheader.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
/// The original loop.
Loop *OrigLoop;
@@ -664,7 +671,21 @@ protected:
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+ /// Mapping of induction phis to their additional bypass values. They
+ /// need to be added as operands to phi nodes in the scalar loop preheader
+ /// after the epilogue skeleton has been created.
+ DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
+
+ /// The additional bypass block which conditionally skips over the epilogue
+ /// loop after executing the main loop. Needed to resume inductions and
+ /// reductions during epilogue vectorization.
+ BasicBlock *AdditionalBypassBlock = nullptr;
+
VPlan &Plan;
+
+ /// The vector preheader block of \p Plan, used as target for check blocks
+ /// introduced during skeleton creation.
+ VPBlockBase *VectorPHVPB;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -681,10 +702,13 @@ struct EpilogueLoopVectorizationInfo {
BasicBlock *MemSafetyCheck = nullptr;
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
+ VPlan &EpiloguePlan;
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
- ElementCount EVF, unsigned EUF)
- : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
+ ElementCount EVF, unsigned EUF,
+ VPlan &EpiloguePlan)
+ : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
+ EpiloguePlan(EpiloguePlan) {
assert(EUF == 1 &&
"A high UF for the epilogue loop is likely not beneficial.");
}
@@ -714,15 +738,15 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
- const SCEV2ValueTy &ExpandedSCEVs) final {
+ BasicBlock *
+ createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
- virtual std::pair<BasicBlock *, Value *>
+ virtual BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
/// Holds and updates state information required to vectorize the main loop
@@ -751,7 +775,7 @@ public:
EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- std::pair<BasicBlock *, Value *>
+ BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
@@ -786,7 +810,7 @@ public:
}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- std::pair<BasicBlock *, Value *>
+ BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
@@ -1214,8 +1238,8 @@ public:
return false;
// Get the source and destination types of the truncate.
- Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
- Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+ Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
// If the truncate is free for the given types, return false. Replacing a
// free truncate with an induction variable would add an induction variable
@@ -1350,9 +1374,10 @@ public:
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
- // If we might exit from anywhere but the latch, must run the exiting
- // iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ // If we might exit from anywhere but the latch and early exit vectorization
+ // is disabled, we must run the exiting iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
"from latch block\n");
return true;
@@ -1706,7 +1731,8 @@ private:
bool needsExtract(Value *V, ElementCount VF) const {
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
- TheLoop->isLoopInvariant(I))
+ TheLoop->isLoopInvariant(I) ||
+ getWideningDecision(I, VF) == CM_Scalarize)
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -2428,6 +2454,21 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
return VectorTripCount;
}
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
+ if (PreVectorPH->getNumSuccessors() != 1) {
+ assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
+ assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
+ "Unexpected successor");
+ VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
+ PreVectorPH = CheckVPIRBB;
+ }
+ VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+ PreVectorPH->swapSuccessors();
+}
+
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
@@ -2502,14 +2543,15 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass & LoopExit (if needed).
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
+
+ // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
+ introduceCheckBlockInVPlan(TCCheckBlock);
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -2526,6 +2568,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
"Should already be a bypass block due to iteration count check");
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
+
+ introduceCheckBlockInVPlan(SCEVCheckBlock);
return SCEVCheckBlock;
}
@@ -2562,80 +2606,40 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
AddedSafetyChecks = true;
+ introduceCheckBlockInVPlan(MemCheckBlock);
return MemCheckBlock;
}
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+ VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+ for (auto &R : make_early_inc_range(*VPBB)) {
+ assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+ R.moveBefore(*IRVPBB, IRVPBB->end());
+ }
+
+ VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+ // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
+
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
- assert((OrigLoop->getUniqueExitBlock() ||
+ assert((OrigLoop->getUniqueLatchExitBlock() ||
Cost->requiresScalarEpilogue(VF.isVector())) &&
- "multiple exit loop without required epilogue?");
+ "loops not exiting via the latch without required epilogue?");
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, Twine(Prefix) + "middle.block");
+ replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
-}
-
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- assert(VectorTripCount && "Expected valid arguments");
-
- Instruction *OldInduction = Legal->getPrimaryInduction();
- Value *EndValue = nullptr;
- Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
- EndValue = VectorTripCount;
- } else {
- IRBuilder<> B(LoopVectorPreHeader->getTerminator());
-
- // Fast-math-flags propagate from the original induction instruction.
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
- EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
- EndValue->setName("ind.end");
-
- // Compute the end value for the additional bypass (if applicable).
- if (AdditionalBypass.first) {
- B.SetInsertPoint(AdditionalBypass.first,
- AdditionalBypass.first->getFirstInsertionPt());
- EndValueFromAdditionalBypass =
- emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
- EndValueFromAdditionalBypass->setName("ind.end");
- }
- }
-
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
-
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
-
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
-
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2652,31 +2656,66 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
return I->second;
}
-void InnerLoopVectorizer::createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(((AdditionalBypass.first && AdditionalBypass.second) ||
- (!AdditionalBypass.first && !AdditionalBypass.second)) &&
- "Inconsistent information about additional bypass.");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
+/// Knowing that loop \p L executes a single vector iteration, add instructions
+/// that will get simplified and thus should not have any cost to \p
+/// InstsToIgnore.
+static void addFullyUnrolledInstructionsToIgnore(
+ Loop *L, const LoopVectorizationLegality::InductionList &IL,
+ SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
+ auto *Cmp = L->getLatchCmpInst();
+ if (Cmp)
+ InstsToIgnore.insert(Cmp);
+ for (const auto &KV : IL) {
+ // Extract the key by hand so that it can be used in the lambda below. Note
+ // that captured structured bindings are a C++20 extension.
+ const PHINode *IV = KV.first;
+
+ // Get next iteration value of the induction variable.
+ Instruction *IVInst =
+ cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
+ if (all_of(IVInst->users(),
+ [&](const User *U) { return U == IV || U == Cmp; }))
+ InstsToIgnore.insert(IVInst);
+ }
+}
+
+void InnerLoopVectorizer::createInductionAdditionalBypassValues(
+ const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
+ assert(MainVectorTripCount && "Must have bypass information");
+
+ Instruction *OldInduction = Legal->getPrimaryInduction();
+ IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
+ getAdditionalBypassBlock()->getFirstInsertionPt());
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ Value *Step = getExpandedStep(II, ExpandedSCEVs);
+ // For the primary induction the additional bypass end value is known.
+ // Otherwise it is computed.
+ Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+ if (OrigPhi != OldInduction) {
+ auto *BinOp = II.getInductionBinOp();
+ // Fast-math-flags propagate from the original induction instruction.
+ if (isa_and_nonnull<FPMathOperator>(BinOp))
+ BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+ // Compute the end value for the additional bypass.
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+ II.getStartValue(), Step, II.getKind(), BinOp);
+ EndValueFromAdditionalBypass->setName("ind.end");
+ }
+
+ // Store the bypass value here, as it needs to be added as operand to its
+ // scalar preheader phi node after the epilogue skeleton has been created.
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
+ "entry for OrigPhi already exits");
+ Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
}
}
-std::pair<BasicBlock *, Value *>
-InnerLoopVectorizer::createVectorizedLoopSkeleton(
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
/*
In this function we generate a new loop. The new loop will contain
@@ -2733,10 +2772,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// faster.
emitMemRuntimeChecks(LoopScalarPreHeader);
- // Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(ExpandedSCEVs);
-
- return {LoopVectorPreHeader, nullptr};
+ return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
@@ -2753,8 +2789,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
DenseMap<Value *, Value *> MissingVals;
Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
@@ -2808,6 +2842,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
}
+ assert((MissingVals.empty() ||
+ all_of(MissingVals,
+ [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
+ return all_of(
+ predecessors(cast<Instruction>(P.first)->getParent()),
+ [MiddleBlock, this](BasicBlock *Pred) {
+ return Pred == MiddleBlock ||
+ Pred == OrigLoop->getLoopLatch();
+ });
+ })) &&
+ "Expected escaping values from latch/middle.block only");
+
for (auto &I : MissingVals) {
PHINode *PHI = cast<PHINode>(I.first);
// One corner case we have to handle is two IVs "chasing" each-other,
@@ -3411,14 +3457,14 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
}
InstructionCost SafeDivisorCost = 0;
- auto *VecTy = ToVectorTy(I->getType(), VF);
+ auto *VecTy = toVectorTy(I->getType(), VF);
// The cost of the select guard to ensure all lanes are well defined
// after we speculate above any internal control flow.
- SafeDivisorCost += TTI.getCmpSelInstrCost(
- Instruction::Select, VecTy,
- ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ SafeDivisorCost +=
+ TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
+ toVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
@@ -3585,10 +3631,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Start with the conditional branches exiting the loop. If the branch
// condition is an instruction contained in the loop that is only used by the
- // branch, it is uniform.
+ // branch, it is uniform. Note conditions from uncountable early exits are not
+ // uniform.
SmallVector<BasicBlock *> Exiting;
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
+ if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
+ continue;
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
AddToWorklistIfAllowed(Cmp);
@@ -4147,7 +4196,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (TC == 0) {
reportVectorizationFailure(
- "Unable to calculate the loop count due to complex control flow",
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
return FixedScalableVFPair::getNone();
@@ -4536,7 +4584,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
}
auto WillWiden = [&TTI, VF](Type *ScalarTy) {
- Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ Type *VectorTy = toVectorTy(ScalarTy, VF);
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
if (!NumLegalParts)
return false;
@@ -4673,6 +4721,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
+ // TODO: Add support for loops with an early exit.
if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
@@ -4921,6 +4970,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;
+ // We don't attempt to perform interleaving for loops with uncountable early
+ // exits because the VPInstruction::AnyOf code cannot currently handle
+ // multiple parts.
+ if (Legal->hasUncountableEarlyExit())
+ return 1;
+
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
@@ -5105,8 +5160,9 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
HasReductions &&
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return RecurrenceDescriptor::isAnyOfRecurrenceKind(
- RdxDesc.getRecurrenceKind());
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
});
if (HasSelectCmpReductions) {
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
@@ -5519,7 +5575,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(I->getType(), VF)),
+ cast<VectorType>(toVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
/*Extract*/ false, CostKind);
ScalarCost +=
@@ -5538,7 +5594,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Worklist.push_back(J);
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ cast<VectorType>(toVectorTy(J->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
/*Extract*/ true, CostKind);
}
@@ -5559,6 +5615,15 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;
+ // If the vector loop gets executed exactly once with the given VF, ignore the
+ // costs of comparison and induction instructions, as they'll get simplified
+ // away.
+ SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
+ auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
+ addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
+ ValuesToIgnoreForVF);
+
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
InstructionCost BlockCost;
@@ -5566,7 +5631,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.count(&I) ||
+ if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
@@ -5640,7 +5705,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ Type *PtrTy = toVectorTy(Ptr->getType(), VF);
// NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
// that it is being called from this specific place.
@@ -5691,7 +5756,7 @@ InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
@@ -5723,7 +5788,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
assert(Legal->isUniformMemOp(*I, VF));
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -5749,7 +5814,7 @@ InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
const Value *Ptr = getLoadStorePointerOperand(I);
@@ -5767,7 +5832,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Instruction *InsertPos = Group->getInsertPos();
Type *ValTy = getLoadStoreType(InsertPos);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -6012,7 +6077,7 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
return 0;
InstructionCost Cost = 0;
- Type *RetTy = ToVectorTy(I->getType(), VF);
+ Type *RetTy = toVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
@@ -6278,9 +6343,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
bool MaskRequired = Legal->isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ Type *RetTy = toVectorTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
+ Tys.push_back(toVectorTy(ScalarTy, VF));
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
@@ -6470,7 +6535,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
HasSingleCopyAfterVectorization(I, VF));
VectorTy = RetTy;
} else
- VectorTy = ToVectorTy(RetTy, VF);
+ VectorTy = toVectorTy(RetTy, VF);
if (VF.isVector() && VectorTy->isVectorTy() &&
!TTI.getNumberOfParts(VectorTy))
@@ -6530,8 +6595,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return Switch->getNumCases() *
TTI.getCmpSelInstrCost(
Instruction::ICmp,
- ToVectorTy(Switch->getCondition()->getType(), VF),
- ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ toVectorTy(Switch->getCondition()->getType(), VF),
+ toVectorTy(Type::getInt1Ty(I->getContext()), VF),
CmpInst::ICMP_EQ, CostKind);
}
case Instruction::PHI: {
@@ -6576,8 +6641,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
- Instruction::Select, ToVectorTy(ResultTy, VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ Instruction::Select, toVectorTy(ResultTy, VF),
+ toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
@@ -6586,8 +6651,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (VF.isVector() && foldTailWithEVL() &&
Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
IntrinsicCostAttributes ICA(
- Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF),
- {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
+ Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
+ {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
return TTI.getIntrinsicInstrCost(ICA, CostKind);
}
@@ -6727,7 +6792,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
}
- VectorTy = ToVectorTy(ValTy, VF);
+ VectorTy = toVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
@@ -6745,7 +6810,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
}
- VectorTy = ToVectorTy(getLoadStoreType(I), Width);
+ VectorTy = toVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
}
case Instruction::BitCast:
@@ -6826,7 +6891,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
SrcScalarTy =
IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
- VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
if (canTruncateToMinimalBitwidth(I, VF)) {
// If the result type is <= the source type, there will be no extend
@@ -7248,6 +7313,17 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
continue;
IVInsts.push_back(CI);
}
+
+ // If the vector loop gets executed exactly once with the given VF, ignore
+ // the costs of comparison and induction instructions, as they'll get
+ // simplified away.
+ // TODO: Remove this code after stepping away from the legacy cost model and
+ // adding code to simplify VPlans before calculating their costs.
+ auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
+ if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
+ addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
+ CostCtx.SkipCostComputation);
+
for (Instruction *IVInst : IVInsts) {
if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
continue;
@@ -7344,7 +7420,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
auto ReductionCost = CM.getReductionPatternCost(
- I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
if (!ReductionCost)
continue;
@@ -7584,7 +7660,8 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
// fix the reduction's scalar PHI node by adding the incoming value from the
// main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) {
+ VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
+ BasicBlock *BypassBlock) {
auto *EpiRedResult = dyn_cast<VPInstruction>(R);
if (!EpiRedResult ||
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
@@ -7621,21 +7698,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
auto *EpiResumePhiVPI =
cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
- BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();
- bool Updated = false;
- for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
- if (is_contained(MainResumePhi->blocks(), Incoming)) {
- assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
- RdxDesc.getRecurrenceStartValue() &&
- "Trying to reset unexpected value");
- assert(!Updated && "Should update at most 1 incoming value");
- EpiResumePhi->setIncomingValueForBlock(
- Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
- Updated = true;
- }
- }
- assert(Updated && "Must update EpiResumePhi.");
- (void)Updated;
+ EpiResumePhi->setIncomingValueForBlock(
+ BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7656,23 +7720,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
OrigLoop->getHeader()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
-
- LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
- << ", UF=" << BestUF << '\n');
- BestVPlan.setName("Final VPlan");
- LLVM_DEBUG(BestVPlan.dump());
+ VPlanTransforms::convertToConcreteRecipes(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
- &BestVPlan);
+ &BestVPlan, OrigLoop->getParentLoop(),
+ Legal->getWidestInductionType());
+
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+ // 0. Generate SCEV-dependent code in the entry, including TripCount, before
+ // making any changes to the CFG.
+ if (!BestVPlan.getEntry()->empty())
+ BestVPlan.getEntry()->execute(&State);
- // 0. Generate SCEV-dependent code into the preheader, including TripCount,
- // before making any changes to the CFG.
- if (!BestVPlan.getPreheader()->empty()) {
- State.CFG.PrevBB = OrigLoop->getLoopPreheader();
- State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
- BestVPlan.getPreheader()->execute(&State);
- }
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
else
@@ -7681,13 +7744,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- Value *CanonicalIVStartValue;
- std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
- ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
- : State.ExpandedSCEVs);
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
+ ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
+ if (VectorizingEpilogue)
+ VPlanTransforms::removeDeadRecipes(BestVPlan);
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
@@ -7718,20 +7778,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(ILV.getTripCount(),
- ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State);
- VPlanTransforms::prepareToExecute(BestVPlan);
+ BestVPlan.prepareToExecute(
+ ILV.getTripCount(),
+ ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+ replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB);
BestVPlan.execute(&State);
- // 2.5 Collect reduction resume values.
auto *ExitVPBB = BestVPlan.getMiddleBlock();
- if (VectorizingEpilogue)
+ // 2.5 When vectorizing the epilogue, fix reduction and induction resume
+ // values from the additional bypass block.
+ if (VectorizingEpilogue) {
+ assert(!ILV.Legal->hasUncountableEarlyExit() &&
+ "Epilogue vectorisation not yet supported with early exits");
+ BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
- &R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
+ &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
+ }
+ BasicBlock *PH = OrigLoop->getLoopPreheader();
+ for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BypassBlock, V);
}
+ }
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7758,7 +7829,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
}
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+ if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
addRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
@@ -7788,8 +7859,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-std::pair<BasicBlock *, Value *>
-EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("");
@@ -7820,12 +7890,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// Generate the induction variable.
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- // Skip induction resume value creation here because they will be created in
- // the second pass for the scalar loop. The induction resume values for the
- // inductions in the epilogue loop are created before executing the plan for
- // the epilogue loop.
-
- return {LoopVectorPreHeader, nullptr};
+ return LoopVectorPreHeader;
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7880,8 +7945,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass.
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
LoopBypassBlocks.push_back(TCCheckBlock);
// Save the trip count so we don't have to regenerate it in the
@@ -7896,6 +7959,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+ introduceCheckBlockInVPlan(TCCheckBlock);
return TCCheckBlock;
}
@@ -7905,7 +7969,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-std::pair<BasicBlock *, Value *>
+BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("vec.epilog.");
@@ -7918,6 +7982,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
nullptr, "vec.epilog.iter.check", true);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
+ AdditionalBypassBlock = VecEpilogueIterationCountCheck;
// Adjust the control flow taking the state info from the main loop
// vectorization into account.
@@ -7926,9 +7991,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopVectorPreHeader);
- DT->changeImmediateDominator(LoopVectorPreHeader,
- EPI.MainLoopIterationCountCheck);
-
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
@@ -7939,19 +8001,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- DT->changeImmediateDominator(
- VecEpilogueIterationCountCheck,
- VecEpilogueIterationCountCheck->getSinglePredecessor());
-
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(),
- EPI.EpilogueIterationCountCheck);
-
// Keep track of bypass blocks, as they feed start values to the induction and
// reduction phis in the scalar loop preheader.
if (EPI.SCEVSafetyCheck)
@@ -7988,27 +8039,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
Phi->removeIncomingValue(EPI.MemSafetyCheck);
}
- // Generate a resume induction for the vector epilogue and put it in the
- // vector epilogue preheader
- Type *IdxTy = Legal->getWidestInductionType();
- PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
- EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
- EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
- EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
- EPI.MainLoopIterationCountCheck);
-
- // Generate induction resume values. These variables save the new starting
- // indexes for the scalar loop. They are used to test if there are any tail
- // iterations left once the vector loop has completed.
- // Note that when the vectorized epilogue is skipped due to iteration count
- // check, then the resume value for the induction variable comes from
- // the trip count of the main vector loop, hence passing the AdditionalBypass
- // argument.
- createInductionResumeValues(ExpandedSCEVs,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
-
- return {LoopVectorPreHeader, EPResumeVal};
+ // Generate bypass values from the additional bypass block. Note that when the
+ // vectorized epilogue is skipped due to iteration count check, then the
+ // resume value for the induction variable comes from the trip count of the
+ // main vector loop, passed as the second argument.
+ createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
+ return LoopVectorPreHeader;
}
BasicBlock *
@@ -8054,6 +8090,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);
LoopBypassBlocks.push_back(Insert);
+
+ // A new entry block has been created for the epilogue VPlan. Hook it in, as
+ // otherwise we would try to modify the entry to the main vector loop.
+ VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
+ VPBasicBlock *OldEntry = Plan.getEntry();
+ VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
+ Plan.setEntry(NewEntry);
+ // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
+
+ introduceCheckBlockInVPlan(Insert);
return Insert;
}
@@ -8160,8 +8206,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
- // adding uses of an otherwise potentially dead instruction.
- if (OrigLoop->isLoopExiting(Src))
+ // adding uses of an otherwise potentially dead instruction unless we are
+ // vectorizing a loop with uncountable exits. In that case, we always
+ // materialize the mask.
+ if (OrigLoop->isLoopExiting(Src) &&
+ Src != Legal->getUncountableEarlyExitingBlock())
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8297,10 +8346,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
if (Reverse)
VectorPtr = new VPReverseVectorPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I),
- GEP ? GEP->isInBounds() : false, I->getDebugLoc());
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ I->getDebugLoc());
else
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
- GEP ? GEP->isInBounds() : false,
+ GEP ? GEP->getNoWrapFlags()
+ : GEPNoWrapFlags::none(),
I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
@@ -8329,11 +8381,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, TruncI);
+ IndDesc, TruncI,
+ TruncI->getDebugLoc());
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc);
+ IndDesc, Phi->getDebugLoc());
}
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
@@ -8355,7 +8408,8 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
[&](ElementCount VF) {
return CM.isScalarAfterVectorization(Phi, VF);
},
- Range));
+ Range),
+ Phi->getDebugLoc());
}
return nullptr;
}
@@ -8809,14 +8863,55 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
-/// Create resume phis in the scalar preheader for first-order recurrences and
-/// reductions and update the VPIRInstructions wrapping the original phis in the
-/// scalar header.
+ /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+ /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+ /// the end value of the induction.
+ ///
+ /// The end value is built with \p VectorPHBuilder from \p VectorTC and is
+ /// truncated to the scalar type of \p WideIV (as inferred by \p TypeInfo) when
+ /// the two types differ. The ResumePhi itself is created with
+ /// \p ScalarPHBuilder and takes the end value and the start value of
+ /// \p WideIV as its operands.
+ ///
+ /// \returns the new ResumePhi, or nullptr if \p WideIV is a
+ /// VPWidenIntOrFpInductionRecipe with a truncate instruction.
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+ VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder,
+ VPTypeAnalysis &TypeInfo,
+ VPValue *VectorTC) {
+ // WideIntOrFp is null when WideIV is some other kind of wide induction
+ // recipe; such recipes always take the derived-IV path below.
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ // Truncated wide inductions resume from the last lane of their vector value
+ // in the last vector iteration which is handled elsewhere.
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ // Default to the vector trip count as the end value; it is only kept for a
+ // canonical VPWidenIntOrFpInductionRecipe. Every other induction gets a
+ // derived IV computed from Start, VectorTC and Step.
+ VPValue *EndValue = VectorTC;
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+ EndValue = VectorPHBuilder.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, VectorTC, Step);
+ }
+
+ // EndValue is derived from the vector trip count (which has the same type as
+ // the widest induction) and thus may be wider than the induction here.
+ Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+ EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+ ScalarTypeOfWideIV);
+ }
+
+ // Operand order is {EndValue, Start}. Per the header comment of
+ // addScalarResumePhis later in this patch, these resume phis live in the
+ // scalar preheader.
+ auto *ResumePhiRecipe =
+ ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
+ WideIV->getDebugLoc(), "bc.resume.val");
+ return ResumePhiRecipe;
+}
+
+/// Create resume phis in the scalar preheader for first-order recurrences,
+/// reductions and inductions, and update the VPIRInstructions wrapping the
+/// original phis in the scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
- VPBuilder ScalarPHBuilder(ScalarPH);
+ VPBuilder VectorPHBuilder(
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPBuilder ScalarPHBuilder(ScalarPH);
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
@@ -8824,9 +8919,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
if (!ScalarPhiI)
break;
+
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
- if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
+ if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+ WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+ &Plan.getVectorTripCount())) {
+ ScalarPhiIRI->addOperand(ResumePhi);
+ continue;
+ }
+ // TODO: Also handle truncated inductions here. Computing end-values
+ // separately should be done as VPlan-to-VPlan optimization, after
+ // legalizing all resume values to use the last lane from the loop.
+ assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+ "should only skip truncated wide inductions");
continue;
+ }
+
// The backedge value provides the value to resume coming out of a loop,
// which for FORs is a vector whose last element needs to be extracted. The
// start value provides the value if the loop is bypassed.
@@ -8852,14 +8961,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
- BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
- BasicBlock *ExitingBB = find_singleton<BasicBlock>(
- to_vector(predecessors(ExitBB)),
- [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
- return OrigLoop->contains(Pred) ? Pred : nullptr;
- });
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
@@ -8867,35 +8971,48 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
if (!ExitPhi)
break;
- Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
- VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
- // Exit values for inductions are computed and updated outside of VPlan
- // and independent of induction recipes.
- // TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- })))
- continue;
- ExitUsersToFix.insert(ExitIRI);
- ExitIRI->addOperand(V);
+ for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
+ BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
+ if (PredVPBB != MiddleVPBB) {
+ SmallVector<BasicBlock *> ExitingBlocks;
+ OrigLoop->getExitingBlocks(ExitingBlocks);
+ assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
+ ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
+ : ExitingBlocks[0];
+ }
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+ // Exit values for inductions are computed and updated outside of VPlan
+ // and independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V) ||
+ (isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+ any_of(IncomingValue->users(), [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ }))) {
+ if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+ continue;
+ }
+ ExitUsersToFix.insert(ExitIRI);
+ ExitIRI->addOperand(V);
+ }
}
}
return ExitUsersToFix;
}
// Add exit values to \p Plan. Extracts are added for each entry in \p
-// ExitUsersToFix if needed and their operands are updated.
-static void
+// ExitUsersToFix if needed and their operands are updated. Returns true if all
+// exit users can be handled, otherwise returns false.
+static bool
addUsersInExitBlocks(VPlan &Plan,
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
if (ExitUsersToFix.empty())
- return;
+ return true;
auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
@@ -8903,20 +9020,25 @@ addUsersInExitBlocks(VPlan &Plan,
// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
- VPValue *V = ExitIRI->getOperand(0);
- // Pass live-in values used by exit phis directly through to their users in
- // the exit block.
- if (V->isLiveIn())
- continue;
+ for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
+ // Pass live-in values used by exit phis directly through to their users
+ // in the exit block.
+ if (Op->isLiveIn())
+ continue;
+
+ // Currently only live-ins can be used by exit values from blocks not
+ // exiting via the vector latch through to the middle block.
+ if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
+ return false;
- assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
- "Exit value not handled yet for this edge.");
- LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
- VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
- {V, Plan.getOrAddLiveIn(ConstantInt::get(
- IntegerType::get(Ctx, 32), 1))});
- ExitIRI->setOperand(0, Ext);
+ LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
+ VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
+ {Op, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(Ctx, 32), 1))});
+ ExitIRI->setOperand(Idx, Ext);
+ }
}
+ return true;
}
/// Handle users in the exit block for first order reductions in the original
@@ -9176,7 +9298,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
- VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+ VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
@@ -9189,11 +9311,22 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ if (auto *UncountableExitingBlock =
+ Legal->getUncountableEarlyExitingBlock()) {
+ VPlanTransforms::handleUncountableEarlyExit(
+ *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
+ }
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlocks(*Plan, ExitUsersToFix);
+ if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
+ reportVectorizationFailure(
+ "Some exit values in loop with uncountable exit not supported yet",
+ "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
+ return nullptr;
+ }
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
@@ -9304,6 +9437,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
+
+ // Collect mapping of IR header phis to header phi recipes, to be used in
+ // addScalarResumePhis.
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+ for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
+ RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
+ }
+ addScalarResumePhis(RecipeBuilder, *Plan);
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9334,8 +9479,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
- assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
- "AnyOf reductions are not allowed for in-loop reductions");
+ assert(
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
+ "AnyOf and FindLast reductions are not allowed for in-loop reductions");
// Collect the chain of "link" recipes for the reduction starting at PhiR.
SetVector<VPSingleDefRecipe *> Worklist;
@@ -9439,9 +9586,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB))
CondOp = RecipeBuilder.getBlockInMask(BB);
- VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
- CondOp, CM.useOrderedReductions(RdxDesc));
+ auto *RedRecipe = new VPReductionRecipe(
+ RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
+ CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of its inputs, including CondOp.
// Note that this transformation may leave over dead recipes (including
@@ -9566,6 +9713,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// Convert the reduction phi to operate on bools.
PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
OrigLoop->getHeader()->getContext())));
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ // Adjust the start value for FindLastIV recurrences to use the sentinel
+ // value after generating the ResumePhi recipe, which uses the original
+ // start value.
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
}
}
@@ -9581,13 +9737,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPLane(0));
- Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
+ Value *Index = State.get(getOperand(1), VPLane(0));
Value *DerivedIV = emitTransformedIndex(
- State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
- Kind, cast_if_present<BinaryOperator>(FPBinOp));
+ State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
+ cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
-
+ // If index is the vector trip count, the concrete value will only be set in
+ // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
+ // TODO: Remove the special case for the vector trip count once it is computed
+ // in VPlan and can be used during VPlan simplification.
+ assert((DerivedIV != Index ||
+ getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
+ "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
@@ -9897,6 +10058,164 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
!EnableLoopVectorization) {}
+/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
+/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
+/// don't have a corresponding wide induction in \p EpiPlan.
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
+ // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
+ // will need their resume-values computed in the main vector loop. Others
+ // can be removed from the main VPlan.
+ SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
+ for (VPRecipeBase &R :
+ EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ EpiWidenedPhis.insert(
+ cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
+ }
+ for (VPRecipeBase &R : make_early_inc_range(
+ *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
+ auto *VPIRInst = cast<VPIRInstruction>(&R);
+ auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
+ if (!IRI)
+ break;
+ if (EpiWidenedPhis.contains(IRI))
+ continue;
+ // There is no corresponding wide induction in the epilogue plan that would
+ // need a resume value. Remove the VPIRInst wrapping the scalar header phi
+ // together with the corresponding ResumePhi. The resume values for the
+ // scalar loop will be created during execution of EpiPlan.
+ VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
+ VPIRInst->eraseFromParent();
+ ResumePhi->eraseFromParent();
+ }
+ VPlanTransforms::removeDeadRecipes(MainPlan);
+
+ using namespace VPlanPatternMatch;
+ VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
+ VPValue *VectorTC = &MainPlan.getVectorTripCount();
+ // If there is a suitable resume value for the canonical induction in the
+ // scalar (which will become vector) epilogue loop we are done. Otherwise
+ // create it below.
+ if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
+ return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
+ m_Specific(VectorTC), m_SpecificInt(0)));
+ }))
+ return;
+ VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
+ ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+ "vec.epilog.resume.val");
+}
+
+/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
+/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
+static void
+preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
+ const SCEV2ValueTy &ExpandedSCEVs,
+ const EpilogueLoopVectorizationInfo &EPI) {
+ VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
+ Header->setName("vec.epilog.vector.body");
+
+ // Re-use the trip count and steps expanded for the main loop, as
+ // skeleton creation needs them as values that dominate both the scalar
+ // and vector epilogue loops.
+ // TODO: This is a workaround needed for epilogue vectorization and it
+ // should be removed once induction resume value creation is done
+ // directly in VPlan.
+ for (auto &R : make_early_inc_range(*Plan.getEntry())) {
+ auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpandR)
+ continue;
+ auto *ExpandedVal =
+ Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+ ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (Plan.getTripCount() == ExpandR)
+ Plan.resetTripCount(ExpandedVal);
+ ExpandR->eraseFromParent();
+ }
+
+ // Ensure that the start values for all header phi recipes are updated before
+ // vectorizing the epilogue loop.
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
+ // When vectorizing the epilogue loop, the canonical induction start
+ // value needs to be changed from zero to the value after the main
+ // vector loop. Find the resume value created during execution of the main
+ // VPlan.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
+ // loop.
+ BasicBlock *MainMiddle = find_singleton<BasicBlock>(
+ predecessors(L->getLoopPreheader()),
+ [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
+ if (BB != EPI.MainLoopIterationCountCheck &&
+ BB != EPI.EpilogueIterationCountCheck &&
+ BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
+ return BB;
+ return nullptr;
+ });
+ using namespace llvm::PatternMatch;
+ Type *IdxTy = IV->getScalarType();
+ PHINode *EPResumeVal = find_singleton<PHINode>(
+ L->getLoopPreheader()->phis(),
+ [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
+ if (P.getType() == IdxTy &&
+ P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
+ match(
+ P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
+ m_SpecificInt(0)))
+ return &P;
+ return nullptr;
+ });
+ assert(EPResumeVal && "must have a resume value for the canonical IV");
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPScalarCastRecipe>(U) ||
+ isa<VPDerivedIVRecipe>(U) ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add;
+ }) &&
+ "the canonical IV should only be used by its increment or "
+ "ScalarIVSteps when resetting the start value");
+ IV->setOperand(0, VPV);
+ continue;
+ }
+
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
+ ->getIncomingValueForBlock(L->getLoopPreheader());
+ const RecurrenceDescriptor &RdxDesc =
+ ReductionPhi->getRecurrenceDescriptor();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ IRBuilder<> Builder(
+ cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+ ResumeV =
+ Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
+ }
+ } else {
+ // Retrieve the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop.
+ PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
+ // Hook up to the PHINode generated by a ResumePhi recipe of main
+ // loop VPlan, which feeds the scalar loop.
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
+ }
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
+ }
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -9946,12 +10265,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (LVL.hasUncountableEarlyExit()) {
+ if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "UncountableEarlyExitLoopsUnsupported", ORE, L);
+ "early exit is not enabled",
+ "UncountableEarlyExitLoopsDisabled", ORE, L);
return false;
}
@@ -9977,6 +10294,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (UseInterleaved)
IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ if (LVL.hasUncountableEarlyExit()) {
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (IAI.requiresScalarEpilogue() ||
+ any_of(LVL.getCountableExitingBlocks(),
+ [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
+ reportVectorizationFailure("Auto-vectorization of early exit loops "
+ "requiring a scalar epilogue is unsupported",
+ "UncountableEarlyExitUnsupported", ORE, L);
+ return false;
+ }
+ }
+
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
ScalarEpilogueLowering SEL =
@@ -10243,11 +10572,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
+ BestEpiPlan);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks,
*BestMainPlan);
-
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
@@ -10256,84 +10587,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks, BestEpiPlan);
-
- VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
- VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
- Header->setName("vec.epilog.vector.body");
-
- // Re-use the trip count and steps expanded for the main loop, as
- // skeleton creation needs it as a value that dominates both the scalar
- // and vector epilogue loops
- // TODO: This is a workaround needed for epilogue vectorization and it
- // should be removed once induction resume value creation is done
- // directly in VPlan.
EpilogILV.setTripCount(MainILV.getTripCount());
- for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
- auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
- if (!ExpandR)
- continue;
- auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
- ExpandedSCEVs.find(ExpandR->getSCEV())->second);
- ExpandR->replaceAllUsesWith(ExpandedVal);
- if (BestEpiPlan.getTripCount() == ExpandR)
- BestEpiPlan.resetTripCount(ExpandedVal);
- ExpandR->eraseFromParent();
- }
-
- // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
- // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
- // before vectorizing the epilogue loop.
- for (VPRecipeBase &R : Header->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
- Value *ResumeV = nullptr;
- // TODO: Move setting of resume values to prepareToExecute.
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
- ->getIncomingValueForBlock(L->getLoopPreheader());
- const RecurrenceDescriptor &RdxDesc =
- ReductionPhi->getRecurrenceDescriptor();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
- // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
- // start value; compare the final value from the main vector loop
- // to the start value.
- IRBuilder<> Builder(
- cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
- ResumeV = Builder.CreateICmpNE(ResumeV,
- RdxDesc.getRecurrenceStartValue());
- }
- } else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
- PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
- if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
- IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
- } else {
- auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
- IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
- }
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
- }
- assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
- cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
- }
-
- assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
- "DT not preserved correctly");
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
@@ -10361,6 +10620,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
checkMixedPrecision(L, ORE);
}
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT not preserved correctly");
+
std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 33657c26356d..f52ddfda5e64 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
unsigned SVNumElements =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
unsigned ShuffleMaskSize = SV->getShuffleMask().size();
+ if (SVNumElements % ShuffleMaskSize != 0)
+ return 0;
unsigned GroupSize = SVNumElements / ShuffleMaskSize;
if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
return 0;
@@ -514,7 +516,7 @@ static bool isCommutative(Instruction *I) {
BO->uses(),
[](const Use &U) {
// Commutative, if icmp eq/ne sub, 0
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(U.getUser(),
m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
(Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
@@ -700,7 +702,8 @@ static SmallBitVector isUndefVector(const Value *V,
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
-isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ AssumptionCache *AC) {
const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
if (It == VL.end())
return std::nullopt;
@@ -717,14 +720,14 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- bool HasNonUndefVec = any_of(VL, [](Value *V) {
+ bool HasNonUndefVec = any_of(VL, [&](Value *V) {
auto *EE = dyn_cast<ExtractElementInst>(V);
if (!EE)
return false;
Value *Vec = EE->getVectorOperand();
if (isa<UndefValue>(Vec))
return false;
- return isGuaranteedNotToBePoison(Vec);
+ return isGuaranteedNotToBePoison(Vec, AC);
});
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
@@ -807,14 +810,16 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
namespace {
/// Main data required for vectorization of instructions.
-struct InstructionsState {
- /// The very first instruction in the list with the main opcode.
- Value *OpValue = nullptr;
-
- /// The main/alternate instruction.
+class InstructionsState {
+ /// The main/alternate instruction. MainOp is also VL0.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+public:
+ Instruction *getMainOp() const { return MainOp; }
+
+ Instruction *getAltOp() const { return AltOp; }
+
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
@@ -833,9 +838,9 @@ struct InstructionsState {
}
InstructionsState() = delete;
- InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
- : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
- static InstructionsState invalid() { return {nullptr, nullptr, nullptr}; }
+ InstructionsState(Instruction *MainOp, Instruction *AltOp)
+ : MainOp(MainOp), AltOp(AltOp) {}
+ static InstructionsState invalid() { return {nullptr, nullptr}; }
};
} // end anonymous namespace
@@ -1073,7 +1078,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
}
- return InstructionsState(V, cast<Instruction>(V),
+ return InstructionsState(cast<Instruction>(V),
cast<Instruction>(VL[AltIndex]));
}
@@ -1087,7 +1092,8 @@ static bool allSameType(ArrayRef<Value *> VL) {
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
if (!UserInst)
return false;
unsigned Opcode = UserInst->getOpcode();
@@ -1104,7 +1110,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
return any_of(enumerate(CI->args()), [&](auto &&Arg) {
- return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
+ return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
Arg.value().get() == Scalar;
});
}
@@ -1842,12 +1848,12 @@ public:
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() &&
- (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
+ (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
!S.isAltShuffle()) &&
all_of(Ops, [&S](Value *V) {
return isa<PoisonValue>(V) ||
cast<Instruction>(V)->getNumOperands() ==
- S.MainOp->getNumOperands();
+ S.getMainOp()->getNumOperands();
}))
return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
: LookAheadHeuristics::ScoreSameOpcode;
@@ -2017,6 +2023,9 @@ public:
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
+ /// When VL0 is an IntrinsicInst, ArgSize is 2 (only its first two arguments
+ /// are counted). Otherwise, ArgSize is User::getNumOperands.
+ unsigned ArgSize = 0;
const TargetLibraryInfo &TLI;
const DataLayout &DL;
@@ -2400,14 +2409,15 @@ public:
}
/// Go through the instructions in VL and append their operands.
- void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
+ // IntrinsicInst::isCommutative returns true if swapping the first "two"
+ // arguments to the intrinsic produces the same result.
constexpr unsigned IntrinsicNumOperands = 2;
- auto *VL0 = cast<Instruction>(*find_if(VL, IsaPred<Instruction>));
- unsigned NumOperands = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands
- : VL0->getNumOperands();
+ unsigned NumOperands = VL0->getNumOperands();
+ ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2440,7 +2450,7 @@ public:
}
/// \returns the number of operands.
- unsigned getNumOperands() const { return OpsVec.size(); }
+ unsigned getNumOperands() const { return ArgSize; }
/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -2460,6 +2470,8 @@ public:
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ assert(Op == getValue(OpIdx, Lane) &&
+ "Op is expected to be getValue(OpIdx, Lane).");
// Small number of loads - try load matching.
if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
return false;
@@ -2517,6 +2529,8 @@ public:
/// Checks if there is at least single compatible operand in lanes other
/// than \p Lane, compatible with the operand \p Op.
bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
+ assert(Op == getValue(OpIdx, Lane) &&
+ "Op is expected to be getValue(OpIdx, Lane).");
bool OpAPO = getData(OpIdx, Lane).APO;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
@@ -2537,13 +2551,11 @@ public:
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
- VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
+ VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
: TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
- L(R.LI->getLoopFor(
- (cast<Instruction>(*find_if(RootVL, IsaPred<Instruction>))
- ->getParent()))) {
+ L(R.LI->getLoopFor((VL0->getParent()))) {
// Append all the operands of RootVL.
- appendOperandsOfVL(RootVL);
+ appendOperandsOfVL(RootVL, VL0);
}
/// \returns a value vector with the operands across all lanes for the
@@ -2617,7 +2629,8 @@ public:
ArrayRef<OperandData> Op0 = OpsVec.front();
for (const OperandData &Data : Op0)
UniqueValues.insert(Data.V);
- for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+ for (ArrayRef<OperandData> Op :
+ ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
if (any_of(Op, [&UniqueValues](const OperandData &Data) {
return !UniqueValues.contains(Data.V);
}))
@@ -2920,13 +2933,11 @@ private:
/// truncation. We collect the entries that will be demoted in ToDemote.
/// \param E Node for analysis
/// \param ToDemote indices of the nodes to be demoted.
- bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
- unsigned &BitWidth,
- SmallVectorImpl<unsigned> &ToDemote,
- DenseSet<const TreeEntry *> &Visited,
- unsigned &MaxDepthLevel,
- bool &IsProfitableToDemote,
- bool IsTruncRoot) const;
+ bool collectValuesToDemote(
+ const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
+ const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote, bool IsTruncRoot) const;
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
@@ -3138,13 +3149,6 @@ private:
SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
8> &GatheredLoads);
- /// Reorder commutative or alt operands to get better probability of
- /// generating vectorized code.
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const BoUpSLP &R);
-
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
@@ -3307,7 +3311,7 @@ private:
/// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
/// other nodes as a series of insertvector instructions.
- SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
+ SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
@@ -3339,27 +3343,13 @@ private:
copy(OpVL, Operands[OpIdx].begin());
}
- /// Set the operands of this bundle in their original order.
- void setOperandsInOrder() {
- assert(Operands.empty() && "Already initialized?");
- auto *I0 = cast<Instruction>(*find_if(Scalars, IsaPred<Instruction>));
- Operands.resize(I0->getNumOperands());
- unsigned NumLanes = Scalars.size();
- for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
- OpIdx != NumOperands; ++OpIdx) {
- Operands[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- if (isa<PoisonValue>(Scalars[Lane])) {
- Operands[OpIdx][Lane] =
- PoisonValue::get(I0->getOperand(OpIdx)->getType());
- continue;
- }
- auto *I = cast<Instruction>(Scalars[Lane]);
- assert(I->getNumOperands() == NumOperands &&
- "Expected same number of operands");
- Operands[OpIdx][Lane] = I->getOperand(OpIdx);
- }
- }
+ /// Set all of this bundle's operands from Scalars.
+ void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
+ VLOperands Ops(Scalars, MainOp, R);
+ if (RequireReorder)
+ Ops.reorder();
+ for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
+ setOperand(I, Ops.getVL(I));
}
/// Reorders operands of the node to the given mask \p Mask.
@@ -3410,8 +3400,8 @@ private:
}
void setOperations(const InstructionsState &S) {
- MainOp = S.MainOp;
- AltOp = S.AltOp;
+ MainOp = S.getMainOp();
+ AltOp = S.getAltOp();
}
Instruction *getMainOp() const {
@@ -3555,6 +3545,13 @@ private:
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
+ if (!CombinedEntriesWithIndices.empty()) {
+ dbgs() << "Combined entries: ";
+ interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
+ dbgs() << "Entry index " << P.first << " with offset " << P.second;
+ });
+ dbgs() << "\n";
+ }
}
#endif
};
@@ -3649,8 +3646,8 @@ private:
}
// Update the scheduler bundle to point to this TreeEntry.
ScheduleData *BundleMember = *Bundle;
- assert((BundleMember || isa<PHINode>(S.MainOp) ||
- isVectorLikeInstWithConstOps(S.MainOp) ||
+ assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
doesNotNeedToSchedule(VL)) &&
"Bundle and VL out of sync");
if (BundleMember) {
@@ -3717,9 +3714,11 @@ private:
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState getScalarsVectorizationState(
- InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState
+ getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps);
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -4790,8 +4789,10 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
+ assert(!Order.empty() &&
+ "Order is empty. Please check it before using isReverseOrder.");
unsigned Sz = Order.size();
- return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+ return all_of(enumerate(Order), [&](const auto &Pair) {
return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
});
}
@@ -5642,8 +5643,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
auto PHICompare = [&](unsigned I1, unsigned I2) {
Value *V1 = TE.Scalars[I1];
Value *V2 = TE.Scalars[I2];
- if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
- isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
+ if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
+ return false;
+ if (isa<PoisonValue>(V1))
+ return true;
+ if (isa<PoisonValue>(V2))
return false;
if (V1->getNumUses() < V2->getNumUses())
return true;
@@ -6511,7 +6515,7 @@ void BoUpSLP::buildExternalUses(
// be used.
if (UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
- Scalar, getRootEntryInstruction(*UseEntry), TLI)) {
+ Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!UseEntry->isGather() && "Bad state");
@@ -6935,8 +6939,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// 2. All users are deleted.
// 3. The load broadcasts are not allowed or the load is not
// broadcasted.
- if (std::distance(LI->user_begin(), LI->user_end()) !=
- LI->getNumUses())
+ if (static_cast<unsigned int>(std::distance(
+ LI->user_begin(), LI->user_end())) != LI->getNumUses())
return false;
if (!IsLegalBroadcastLoad)
continue;
@@ -7426,17 +7430,17 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
unsigned Opcode1 = S.getAltOpcode();
SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
// If this pattern is supported by the target then consider it profitable.
- if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
+ if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
Opcode0, Opcode1, OpcodeMask))
return true;
SmallVector<ValueList> Operands;
- for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
+ for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
Operands.emplace_back();
// Prepare the operand vector.
for (Value *V : VL) {
if (isa<PoisonValue>(V)) {
Operands.back().push_back(
- PoisonValue::get(S.MainOp->getOperand(I)->getType()));
+ PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
continue;
}
Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
@@ -7486,7 +7490,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
++ExtraShuffleInsts;
}
}
- const Loop *L = LI->getLoopFor(S.MainOp->getParent());
+ const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
// Vectorize node, if:
// 1. at least single operand is constant or splat.
// 2. Operands have many loop invariants (the instructions are not loop
@@ -7496,7 +7500,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
[&](ArrayRef<Value *> Op) {
if (allConstant(Op) ||
(!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
- getSameOpcode(Op, *TLI).MainOp))
+ getSameOpcode(Op, *TLI).getMainOp()))
return false;
DenseMap<Value *, unsigned> Uniques;
for (Value *V : Op) {
@@ -7528,19 +7532,21 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
// vector operands is number of vector instructions + number of vector
// instructions for operands (buildvectors). Number of buildvector
// instructions is just number_of_operands * number_of_scalars.
- (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
+ (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
(UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
- NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
+ NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
- InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
- assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+ const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps) {
+ assert(S.getMainOp() &&
+ "Expected instructions with same/alternate opcodes only.");
unsigned ShuffleOrOp =
S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
- auto *VL0 = cast<Instruction>(S.OpValue);
+ Instruction *VL0 = S.getMainOp();
switch (ShuffleOrOp) {
case Instruction::PHI: {
// Too many operands - gather, most probably won't be vectorized.
@@ -7712,7 +7718,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::Or:
case Instruction::Xor:
case Instruction::Freeze:
- if (S.MainOp->getType()->isFloatingPointTy() &&
+ if (S.getMainOp()->getType()->isFloatingPointTy() &&
TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && I->isBinaryOp() && !I->isFast();
@@ -7809,7 +7815,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::NeedToGather;
}
case Instruction::Call: {
- if (S.MainOp->getType()->isFloatingPointTy() &&
+ if (S.getMainOp()->getType()->isFloatingPointTy() &&
TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && !I->isFast();
@@ -7834,7 +7840,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
unsigned NumArgs = CI->arg_size();
SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned J = 0; J != NumArgs; ++J)
- if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
ScalarArgs[J] = CI->getArgOperand(J);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -7850,7 +7856,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned J = 0; J != NumArgs; ++J) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
Value *A1J = CI2->getArgOperand(J);
if (ScalarArgs[J] != A1J) {
LLVM_DEBUG(dbgs()
@@ -8035,7 +8041,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) || !isConstant(V);
}))) {
if (DoNotFail && UniquePositions.size() > 1 &&
- NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+ NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
// Find the number of elements, which forms full vectors.
unsigned PWSz = getFullVectorNumberOfElements(
@@ -8065,8 +8071,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
// place to insert a shuffle if we need to, so just avoid that issue.
- if (S.MainOp &&
- isa<CatchSwitchInst>(S.MainOp->getParent()->getTerminator())) {
+ if (S.getMainOp() &&
+ isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
@@ -8074,10 +8080,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
+ << ".\n");
if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
- auto It = MultiNodeScalars.find(S.OpValue);
+ auto It = MultiNodeScalars.find(S.getMainOp());
if (It != MultiNodeScalars.end()) {
auto *TEIt = find_if(It->getSecond(),
[&](TreeEntry *ME) { return ME->isSame(VL); });
@@ -8090,7 +8097,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
if (!E) {
- if (!doesNotNeedToBeScheduled(S.OpValue)) {
+ if (!doesNotNeedToBeScheduled(S.getMainOp())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8098,8 +8105,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
SmallPtrSet<const TreeEntry *, 4> Nodes;
- Nodes.insert(getTreeEntry(S.OpValue));
- for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
+ Nodes.insert(getTreeEntry(S.getMainOp()));
+ for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
Nodes.insert(E);
SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
if (any_of(Nodes, [&](const TreeEntry *E) {
@@ -8122,7 +8129,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// used to properly draw the graph rather than for the actual
// vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
<< ".\n");
return;
}
@@ -8133,13 +8140,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// a load), in which case peek through to include it in the tree, without
// ballooning over-budget.
if (Depth >= RecursionMaxDepth &&
- !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
- VL.size() >= 4 &&
- (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
+ !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
+ (match(S.getMainOp(), m_Load(m_Value())) ||
+ all_of(VL, [&S](const Value *I) {
return match(I,
m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
cast<Instruction>(I)->getOpcode() ==
- cast<Instruction>(S.MainOp)->getOpcode();
+ S.getMainOp()->getOpcode();
})))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
if (TryToFindDuplicates(S))
@@ -8151,7 +8158,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't handle scalable vectors
if (S.getOpcode() == Instruction::ExtractElement &&
isa<ScalableVectorType>(
- cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
+ cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8188,7 +8195,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
}));
}
- bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+ bool IsCommutative =
+ isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
if ((IsCommutative &&
std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
(!IsCommutative &&
@@ -8198,20 +8206,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
auto *I1 = cast<Instruction>(VL.front());
auto *I2 = cast<Instruction>(VL.back());
- for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
if (static_cast<unsigned>(count_if(
Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
- })) >= S.MainOp->getNumOperands() / 2)
+ })) >= S.getMainOp()->getNumOperands() / 2)
return false;
- if (S.MainOp->getNumOperands() > 2)
+ if (S.getMainOp()->getNumOperands() > 2)
return true;
if (IsCommutative) {
// Check permuted operands.
Candidates.clear();
- for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand((Op + 1) % E));
if (any_of(
@@ -8246,7 +8254,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
(isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
- S.OpValue) &&
+ S.getMainOp()) &&
!all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
@@ -8313,10 +8321,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
- auto *VL0 = cast<Instruction>(S.OpValue);
+ Instruction *VL0 = S.getMainOp();
BB = VL0->getParent();
- if (S.MainOp &&
+ if (S.getMainOp() &&
(BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
!DT->isReachableFromEntry(BB))) {
// Don't go into unreachable blocks. They may contain instructions with
@@ -8394,7 +8402,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
+ TE->dump());
// Keeps the reordered operands to avoid code duplication.
PHIHandler Handler(*DT, PH, VL);
@@ -8423,13 +8432,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
+ "(ExtractValueInst/ExtractElementInst).\n";
+ TE->dump());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
+ TE->setOperand(*this);
return;
}
case Instruction::InsertElement: {
@@ -8457,9 +8467,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CurrentOrder.clear();
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
{}, CurrentOrder);
- LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
+ TE->setOperand(*this);
buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
return;
}
@@ -8477,30 +8488,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
if (CurrentOrder.empty())
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
+ TE->dump());
else
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs()
+ << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices);
- TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::CombinedVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
+ TE->setOperand(*this);
+ if (State == TreeEntry::ScatterVectorize)
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
return;
}
case Instruction::ZExt:
@@ -8536,10 +8553,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
if (ShuffleOrOp == Instruction::Trunc) {
ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8563,15 +8581,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
+ TE->dump());
ValueList Left, Right;
+ VLOperands Ops(VL, VL0, *this);
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == CmpInst::getSwappedPredicate(P0) &&
"Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -8630,29 +8652,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Freeze: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
-
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry "
+ "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
case Instruction::GetElementPtr: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
+ TE->dump());
SmallVector<ValueList, 2> Operands(2);
// Prepare the operand vector for pointer operands.
for (Value *V : VL) {
@@ -8710,12 +8724,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
if (Consecutive)
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
+ TE->dump());
else
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
+ TE->dump());
+ TE->setOperand(*this);
+ buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
return;
}
case Instruction::Call: {
@@ -8726,93 +8743,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isCommutative(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- SmallVector<ValueList> Operands;
- for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
- Operands.emplace_back();
- if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
- continue;
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.back().push_back(CI2->getArgOperand(I));
- }
- TE->setOperand(I, Operands.back());
- }
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
- if (Operands[I - 2].empty())
- continue;
- buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
- }
- return;
- }
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
+ TE->dump());
+ TE->setOperand(*this, isCommutative(VL0));
+ for (unsigned I : seq<unsigned>(CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
// vectorize it.
- if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
continue;
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(I));
- }
- buildTree_rec(Operands, Depth + 1, {TE, I});
+ buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
}
return;
}
case Instruction::ShuffleVector: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ if (S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
+ TE->dump());
+ } else {
+ assert(SLPReVec && "Only supported by REVEC.");
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
+ TE->dump());
+ }
// Reorder operands if reordering would enable vectorization.
auto *CI = dyn_cast<CmpInst>(VL0);
- if (isa<BinaryOperator>(VL0) || CI) {
+ if (CI && any_of(VL, [](Value *V) {
+ return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
+ })) {
+ auto *MainCI = cast<CmpInst>(S.getMainOp());
+ auto *AltCI = cast<CmpInst>(S.getAltOp());
+ CmpInst::Predicate MainP = MainCI->getPredicate();
+ CmpInst::Predicate AltP = AltCI->getPredicate();
+ assert(MainP != AltP &&
+ "Expected different main/alternate predicates.");
ValueList Left, Right;
- if (!CI || all_of(VL, [](Value *V) {
- return isa<PoisonValue>(V) || cast<CmpInst>(V)->isCommutative();
- })) {
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- } else {
- auto *MainCI = cast<CmpInst>(S.MainOp);
- auto *AltCI = cast<CmpInst>(S.AltOp);
- CmpInst::Predicate MainP = MainCI->getPredicate();
- CmpInst::Predicate AltP = AltCI->getPredicate();
- assert(MainP != AltP &&
- "Expected different main/alternate predicates.");
- // Collect operands - commute if it uses the swapped predicate or
- // alternate operation.
- for (Value *V : VL) {
- if (isa<PoisonValue>(V)) {
- Left.push_back(
- PoisonValue::get(MainCI->getOperand(0)->getType()));
- Right.push_back(
- PoisonValue::get(MainCI->getOperand(1)->getType()));
- continue;
- }
- auto *Cmp = cast<CmpInst>(V);
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V)) {
+ Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
+ Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
+ continue;
+ }
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
- if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
- if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(LHS, RHS);
- } else {
- if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(LHS, RHS);
- }
- Left.push_back(LHS);
- Right.push_back(RHS);
+ if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
+ if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(LHS, RHS);
+ } else {
+ if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(LHS, RHS);
}
+ Left.push_back(LHS);
+ Right.push_back(RHS);
}
TE->setOperand(0, Left);
TE->setOperand(1, Right);
@@ -8821,8 +8809,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
@@ -9707,7 +9695,7 @@ void BoUpSLP::transformNodes() {
auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
const InstructionsState &S) {
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
- for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
+ for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
return all_of(
@@ -9778,7 +9766,8 @@ void BoUpSLP::transformNodes() {
Slice.front()->getType(), 2 * VF)),
1U, 2 * VF)) ||
count(Slice, Slice.front()) ==
- (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
+ static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
+ : 1)) {
if (IsSplat)
continue;
InstructionsState S = getSameOpcode(Slice, *TLI);
@@ -9791,7 +9780,7 @@ void BoUpSLP::transformNodes() {
// Try to vectorize reduced values or if all users are vectorized.
// For expensive instructions extra extracts might be profitable.
if ((!UserIgnoreList || E.Idx != 0) &&
- TTI->getInstructionCost(S.MainOp, CostKind) <
+ TTI->getInstructionCost(S.getMainOp(), CostKind) <
TTI::TCC_Expensive &&
!all_of(Slice, [&](Value *V) {
if (isa<PoisonValue>(V))
@@ -9818,10 +9807,10 @@ void BoUpSLP::transformNodes() {
continue;
}
} else if (S.getOpcode() == Instruction::ExtractElement ||
- (TTI->getInstructionCost(S.MainOp, CostKind) <
+ (TTI->getInstructionCost(S.getMainOp(), CostKind) <
TTI::TCC_Expensive &&
!CheckOperandsProfitability(
- S.MainOp,
+ S.getMainOp(),
cast<Instruction>(*find_if(reverse(Slice),
IsaPred<Instruction>)),
S))) {
@@ -9891,7 +9880,7 @@ void BoUpSLP::transformNodes() {
Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
- if (isReverseOrder(E.ReorderIndices) &&
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
SmallVector<int> Mask;
inversePermutation(E.ReorderIndices, Mask);
@@ -9918,7 +9907,7 @@ void BoUpSLP::transformNodes() {
Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
- if (isReverseOrder(E.ReorderIndices) &&
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
SmallVector<int> Mask;
inversePermutation(E.ReorderIndices, Mask);
@@ -10272,9 +10261,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// sub-Mask into the CommonMask to estimate it later and avoid double cost
// estimation.
if ((InVectors.size() == 2 &&
- InVectors.front().get<const TreeEntry *>() == &E1 &&
- InVectors.back().get<const TreeEntry *>() == E2) ||
- (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
+ cast<const TreeEntry *>(InVectors.front()) == &E1 &&
+ cast<const TreeEntry *>(InVectors.back()) == E2) ||
+ (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
@@ -10300,7 +10289,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VF = std::max(VF,
cast<FixedVectorType>(V1->getType())->getNumElements());
} else {
- const auto *E = InVectors.front().get<const TreeEntry *>();
+ const auto *E = cast<const TreeEntry *>(InVectors.front());
VF = std::max(VF, E->getVectorFactor());
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
@@ -10316,7 +10305,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VF = std::max(VF,
getNumElements(V1->getType()));
} else {
- const auto *E = P.get<const TreeEntry *>();
+ const auto *E = cast<const TreeEntry *>(P);
VF = std::max(VF, E->getVectorFactor());
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
@@ -10422,9 +10411,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
};
if (!V1 && !V2 && !P2.isNull()) {
// Shuffle 2 entry nodes.
- const TreeEntry *E = P1.get<const TreeEntry *>();
+ const TreeEntry *E = cast<const TreeEntry *>(P1);
unsigned VF = E->getVectorFactor();
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ const TreeEntry *E2 = cast<const TreeEntry *>(P2);
CommonVF = std::max(VF, E2->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10456,7 +10445,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
} else if (!V1 && P2.isNull()) {
// Shuffle single entry node.
- const TreeEntry *E = P1.get<const TreeEntry *>();
+ const TreeEntry *E = cast<const TreeEntry *>(P1);
unsigned VF = E->getVectorFactor();
CommonVF = VF;
assert(
@@ -10505,7 +10494,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
} else if (V1 && !V2) {
// Shuffle vector and tree node.
unsigned VF = getVF(V1);
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ const TreeEntry *E2 = cast<const TreeEntry *>(P2);
CommonVF = std::max(VF, E2->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10531,7 +10520,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
} else if (!V1 && V2) {
// Shuffle vector and tree node.
unsigned VF = getVF(V2);
- const TreeEntry *E1 = P1.get<const TreeEntry *>();
+ const TreeEntry *E1 = cast<const TreeEntry *>(P1);
CommonVF = std::max(VF, E1->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10769,8 +10758,8 @@ public:
if (P.value() == PoisonMaskElem)
return Mask[P.index()] == PoisonMaskElem;
auto *EI = cast<ExtractElementInst>(
- InVectors.front().get<const TreeEntry *>()->getOrdered(
- P.index()));
+ cast<const TreeEntry *>(InVectors.front())
+ ->getOrdered(P.index()));
return EI->getVectorOperand() == V1 ||
EI->getVectorOperand() == V2;
}) &&
@@ -10787,23 +10776,21 @@ public:
}
if (ForExtracts) {
// No need to add vectors here, already handled them in adjustExtracts.
- assert(
- InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
- !CommonMask.empty() &&
- all_of(enumerate(CommonMask),
- [&](auto P) {
- Value *Scalar =
- InVectors.front().get<const TreeEntry *>()->getOrdered(
- P.index());
- if (P.value() == PoisonMaskElem)
- return P.value() == Mask[P.index()] ||
- isa<UndefValue>(Scalar);
- if (isa<Constant>(V1))
- return true;
- auto *EI = cast<ExtractElementInst>(Scalar);
- return EI->getVectorOperand() == V1;
- }) &&
- "Expected only tree entry for extractelement vectors.");
+ assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
+ !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar = cast<const TreeEntry *>(InVectors[0])
+ ->getOrdered(P.index());
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
return;
}
assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -10818,7 +10805,7 @@ public:
VF = std::max(VF, InTE->getVectorFactor());
} else {
VF = std::max(
- VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
+ VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
->getNumElements());
}
InVectors.push_back(V1);
@@ -10888,7 +10875,7 @@ public:
CommonMask[Idx] = Idx;
assert(VF > 0 &&
"Expected vector length for the final value before action.");
- Value *V = Vec.get<Value *>();
+ Value *V = cast<Value *>(Vec);
Action(V, CommonMask);
InVectors.front() = V;
}
@@ -10998,14 +10985,14 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
-static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
- const Intrinsic::ID ID,
- const unsigned VF,
- unsigned MinBW) {
+static SmallVector<Type *>
+buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
+ const unsigned VF, unsigned MinBW,
+ const TargetTransformInfo *TTI) {
SmallVector<Type *> ArgTys;
for (auto [Idx, Arg] : enumerate(CI->args())) {
if (ID != Intrinsic::not_intrinsic) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
ArgTys.push_back(Arg->getType());
continue;
}
@@ -11044,7 +11031,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
unsigned EntryVF = E->getVectorFactor();
auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->isGather()) {
if (allConstant(VL))
return 0;
@@ -11057,9 +11043,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
- if (!E->ReorderIndices.empty() &&
- (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
+ if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
+ !isReverseOrder(E->ReorderIndices))) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -11070,7 +11055,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
::addMask(Mask, NewMask);
}
- if (NeedToShuffleReuses)
+ if (!E->ReuseShuffleIndices.empty())
::addMask(Mask, E->ReuseShuffleIndices);
if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
CommonCost =
@@ -11458,7 +11443,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
- CmpInst::Predicate VecPred, SwappedVecPred;
+ CmpPredicate VecPred, SwappedVecPred;
auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
match(VL0, MatchCmp))
@@ -11472,13 +11457,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return InstructionCost(TTI::TCC_Free);
auto *VI = cast<Instruction>(UniqueValues[Idx]);
- CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
- ? CmpInst::BAD_FCMP_PREDICATE
- : CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
+ ? CmpInst::BAD_FCMP_PREDICATE
+ : CmpInst::BAD_ICMP_PREDICATE;
auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+ // FIXME: Use CmpPredicate::getMatching here.
if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
!match(VI, MatchCmp)) ||
- (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
+ (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
+ CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
@@ -11707,9 +11694,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto GetVectorCost = [=](InstructionCost CommonCost) {
auto *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- SmallVector<Type *> ArgTys =
- buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
- It != MinBWs.end() ? It->second.first : 0);
+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
+ CI, ID, VecTy->getNumElements(),
+ It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
};
@@ -11894,7 +11881,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
TE->Scalars.size() < Limit ||
((TE->getOpcode() == Instruction::ExtractElement ||
all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
- isFixedVectorShuffle(TE->Scalars, Mask)) ||
+ isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
(TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
any_of(TE->Scalars, IsaPred<LoadInst>));
};
@@ -12959,7 +12946,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
std::optional<TTI::ShuffleKind> Res =
- isFixedVectorShuffle(GatheredExtracts, Mask);
+ isFixedVectorShuffle(GatheredExtracts, Mask, AC);
if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
// TODO: try to check other subsets if possible.
// Restore the original VL if attempt was not successful.
@@ -13209,14 +13196,15 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
copy(CommonMask, Mask.begin());
}
// Clear undef scalars.
- for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (isa<PoisonValue>(VL[I]))
- Mask[I] = PoisonMaskElem;
+ Mask[Part * VL.size() + I] = PoisonMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node from the
// tree.
Entries.push_back(FirstEntries.front());
+ VF = FirstEntries.front()->getVectorFactor();
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -13257,6 +13245,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Entries.push_back(SecondEntries.front());
VF = std::max(Entries.front()->getVectorFactor(),
Entries.back()->getVectorFactor());
+ } else {
+ VF = Entries.front()->getVectorFactor();
}
}
@@ -13368,17 +13358,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
}
- switch (Entries.size()) {
- case 1:
- if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
- return TargetTransformInfo::SK_PermuteSingleSrc;
- break;
- case 2:
- if (EntryLanes.size() > 2 || VL.size() <= 2)
- return TargetTransformInfo::SK_PermuteTwoSrc;
- break;
- default:
- break;
+ if (ForOrder || IsIdentity || Entries.empty()) {
+ switch (Entries.size()) {
+ case 1:
+ if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ break;
+ case 2:
+ if (EntryLanes.size() > 2 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteTwoSrc;
+ break;
+ default:
+ break;
+ }
+ } else if (!isa<VectorType>(VL.front()->getType()) &&
+ (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
+ // Do the cost estimation to check if shuffle is more beneficial than buildvector.
+ SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()));
+ int MinElement = SubMask.front(), MaxElement = SubMask.front();
+ for (int Idx : SubMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
+ MinElement = Idx;
+ if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
+ MaxElement = Idx;
+ }
+ assert(MaxElement >= 0 && MinElement >= 0 &&
+ MaxElement % VF >= MinElement % VF &&
+ "Expected at least single element.");
+ unsigned NewVF = std::max<unsigned>(
+ VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
+ (MaxElement % VF) -
+ (MinElement % VF) + 1));
+ if (NewVF < VF) {
+ for_each(SubMask, [&](int &Idx) {
+ if (Idx == PoisonMaskElem)
+ return;
+ Idx = (Idx % VF) - (MinElement % VF) +
+ (Idx >= static_cast<int>(VF) ? NewVF : 0);
+ });
+ VF = NewVF;
+ }
+
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *VecTy = getWidenedType(VL.front()->getType(), VF);
+ auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
+ auto GetShuffleCost = [&,
+ &TTI = *TTI](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> Entries,
+ VectorType *VecTy) -> InstructionCost {
+ if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+ Mask, Entries.front()->getInterleaveFactor()))
+ return TTI::TCC_Free;
+ return ::getShuffleCost(TTI,
+ Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
+ : TTI::SK_PermuteSingleSrc,
+ VecTy, Mask, CostKind);
+ };
+ InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
+ InstructionCost FirstShuffleCost = 0;
+ SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
+ if (Entries.size() == 1 || !Entries[0]->isGather()) {
+ FirstShuffleCost = ShuffleCost;
+ } else {
+ // Transform mask to include only first entry.
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ bool IsIdentity = true;
+ for (auto [I, Idx] : enumerate(FirstMask)) {
+ if (Idx >= static_cast<int>(VF)) {
+ Idx = PoisonMaskElem;
+ } else {
+ DemandedElts.clearBit(I);
+ if (Idx != PoisonMaskElem)
+ IsIdentity &= static_cast<int>(I) == Idx;
+ }
+ }
+ if (!IsIdentity)
+ FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
+ FirstShuffleCost += TTI->getScalarizationOverhead(
+ MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ }
+ InstructionCost SecondShuffleCost = 0;
+ SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
+ if (Entries.size() == 1 || !Entries[1]->isGather()) {
+ SecondShuffleCost = ShuffleCost;
+ } else {
+ // Transform mask to include only second entry.
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ bool IsIdentity = true;
+ for (auto [I, Idx] : enumerate(SecondMask)) {
+ if (Idx < static_cast<int>(VF) && Idx >= 0) {
+ Idx = PoisonMaskElem;
+ } else {
+ DemandedElts.clearBit(I);
+ if (Idx != PoisonMaskElem) {
+ Idx -= VF;
+ IsIdentity &= static_cast<int>(I) == Idx;
+ }
+ }
+ }
+ if (!IsIdentity)
+ SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
+ SecondShuffleCost += TTI->getScalarizationOverhead(
+ MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ }
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ for (auto [I, Idx] : enumerate(SubMask))
+ if (Idx == PoisonMaskElem)
+ DemandedElts.clearBit(I);
+ InstructionCost BuildVectorCost =
+ TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ const TreeEntry *BestEntry = nullptr;
+ if (FirstShuffleCost < ShuffleCost) {
+ copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
+ BestEntry = Entries.front();
+ ShuffleCost = FirstShuffleCost;
+ }
+ if (SecondShuffleCost < ShuffleCost) {
+ copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
+ BestEntry = Entries[1];
+ ShuffleCost = SecondShuffleCost;
+ }
+ if (BuildVectorCost >= ShuffleCost) {
+ if (BestEntry) {
+ Entries.clear();
+ Entries.push_back(BestEntry);
+ }
+ return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+ }
}
Entries.clear();
// Clear the corresponding mask elements.
@@ -13526,21 +13640,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
return Cost;
}
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const BoUpSLP &R) {
- if (VL.empty())
- return;
- VLOperands Ops(VL, R);
- // Reorder the operands in place.
- Ops.reorder();
- Left = Ops.getVL(0);
- Right = Ops.getVL(1);
-}
-
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
if (Res)
@@ -14481,10 +14580,10 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
VE->isSame(TE->Scalars);
}));
};
- TreeEntry *VE = getTreeEntry(S.OpValue);
+ TreeEntry *VE = getTreeEntry(S.getMainOp());
if (VE && CheckSameVE(VE))
return VE;
- auto It = MultiNodeScalars.find(S.OpValue);
+ auto It = MultiNodeScalars.find(S.getMainOp());
if (It != MultiNodeScalars.end()) {
auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
return TE != VE && CheckSameVE(TE);
@@ -14862,7 +14961,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
// non-poisonous, or by freezing the incoming scalar value first.
auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
- (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
+ (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
(E->UserTreeIndices.size() == 1 &&
any_of(V->uses(), [E](const Use &U) {
// Check if the value already used in the same operation in
@@ -14934,11 +15033,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
if (Vec2) {
IsUsedInExpr = false;
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
+ isGuaranteedNotToBePoison(Vec2, AC);
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
} else if (Vec1) {
- bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
+ bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
IsUsedInExpr &= FindReusedSplat(
ExtractMask,
cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
@@ -14969,7 +15068,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (TEs.size() == 1) {
bool IsNotPoisonedVec =
TEs.front()->VectorizedValue
- ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
+ ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
: true;
IsUsedInExpr &=
FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
@@ -14981,8 +15080,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
}
}
}
@@ -15133,7 +15232,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return Vec;
}
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
+ bool IsReverseOrder =
+ !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
if (E->getOpcode() == Instruction::Store &&
@@ -15316,7 +15416,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
}
if (!IsIdentity || NumElts != NumScalars) {
Value *V2 = nullptr;
- bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
+ bool IsVNonPoisonous =
+ !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
SmallVector<int> InsertMask(Mask);
if (NumElts != NumScalars && Offset == 0) {
// Follow all insert element instructions from the current buildvector
@@ -15519,6 +15620,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
+ ICmp->setSameSign(/*B=*/false);
// Do not cast for cmps.
VecTy = cast<FixedVectorType>(V->getType());
V = FinalShuffle(V, E);
@@ -15881,9 +15984,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- SmallVector<Type *> ArgTys =
- buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
- It != MinBWs.end() ? It->second.first : 0);
+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
+ CI, ID, VecTy->getNumElements(),
+ It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
VecCallCosts.first <= VecCallCosts.second;
@@ -15899,7 +16002,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
ScalarArg = CEI->getArgOperand(I);
// If decided to reduce bitwidth of abs intrinsic, its second argument
// must be set false (do not return poison, if value is signed min).
@@ -16214,6 +16317,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
+ if (auto *VecI = dyn_cast<Instruction>(Vec);
+ VecI && VecI->getParent() == Builder.GetInsertBlock() &&
+ Builder.GetInsertPoint()->comesBefore(VecI))
+ VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
+ Builder.GetInsertPoint());
if (Vec->getType() != PrevVec->getType()) {
assert(Vec->getType()->isIntOrIntVectorTy() &&
PrevVec->getType()->isIntOrIntVectorTy() &&
@@ -16433,7 +16541,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
E->State == TreeEntry::StridedVectorize) &&
doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry),
- TLI);
+ TLI, TTI);
})) &&
"Scalar with nullptr User must be registered in "
"ExternallyUsedValues map or remain as scalar in vectorized "
@@ -16966,13 +17074,13 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
- if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
- doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
return nullptr;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
- LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
ScheduleData *Bundle) {
@@ -17053,7 +17161,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
auto *Bundle = buildBundle(VL);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
- cancelScheduling(VL, S.OpValue);
+ cancelScheduling(VL, S.getMainOp());
return std::nullopt;
}
return Bundle;
@@ -17574,8 +17682,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
bool BoUpSLP::collectValuesToDemote(
const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
- unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
- bool IsTruncRoot) const {
+ const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote, bool IsTruncRoot) const {
// We can always demote constants.
if (all_of(E.Scalars, IsaPred<Constant>))
return true;
@@ -17587,6 +17695,10 @@ bool BoUpSLP::collectValuesToDemote(
return true;
}
+ // Check if the node was analyzed already and must keep its original bitwidth.
+ if (NodesToKeepBWs.contains(E.Idx))
+ return false;
+
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
@@ -17682,8 +17794,8 @@ bool BoUpSLP::collectValuesToDemote(
for (const TreeEntry *Op : Operands) {
unsigned Level = InitLevel;
if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
- ToDemote, Visited, Level, IsProfitableToDemote,
- IsTruncRoot)) {
+ ToDemote, Visited, NodesToKeepBWs, Level,
+ IsProfitableToDemote, IsTruncRoot)) {
if (!IsProfitableToDemote)
return false;
NeedToExit = true;
@@ -17929,7 +18041,8 @@ bool BoUpSLP::collectValuesToDemote(
// Choose the best bitwidth based on cost estimations.
auto Checker = [&](unsigned BitWidth, unsigned) {
unsigned MinBW = PowerOf2Ceil(BitWidth);
- SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
+ SmallVector<Type *> ArgTys =
+ buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
auto VecCallCosts = getVectorCallCosts(
IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
TTI, TLI, ArgTys);
@@ -17985,6 +18098,7 @@ void BoUpSLP::computeMinimumValueSizes() {
bool IsTruncRoot = false;
bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
SmallVector<unsigned> RootDemotes;
+ SmallDenseSet<unsigned, 8> NodesToKeepBWs;
if (NodeIdx != 0 &&
VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
@@ -18008,6 +18122,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Check if the root is trunc and the next node is gather/buildvector, then
// keep trunc in scalars, which is free in most cases.
if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
+ !NodesToKeepBWs.contains(E.Idx) &&
E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
all_of(E.Scalars, [&](Value *V) {
return V->hasOneUse() || isa<Constant>(V) ||
@@ -18130,8 +18245,8 @@ void BoUpSLP::computeMinimumValueSizes() {
bool NeedToDemote = IsProfitableToDemote;
if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
- ToDemote, Visited, MaxDepthLevel, NeedToDemote,
- IsTruncRoot) ||
+ ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
+ NeedToDemote, IsTruncRoot) ||
(MaxDepthLevel <= Limit &&
!(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
(!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
@@ -18265,7 +18380,7 @@ void BoUpSLP::computeMinimumValueSizes() {
});
}
- // If the maximum bit width we compute is less than the with of the roots'
+ // If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth == 0 ||
MaxBitWidth >=
@@ -18273,6 +18388,7 @@ void BoUpSLP::computeMinimumValueSizes() {
->getBitWidth()) {
if (UserIgnoreList)
AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
+ NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
continue;
}
@@ -18432,7 +18548,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
(VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
if ((!IsAllowedSize && S.getOpcode() &&
S.getOpcode() != Instruction::Load &&
- (!S.MainOp->isSafeToRemove() ||
+ (!S.getMainOp()->isSafeToRemove() ||
any_of(ValOps.getArrayRef(),
[&](Value *V) {
return !isa<ExtractElementInst>(V) &&
@@ -18969,7 +19085,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!S.getOpcode())
return false;
- Instruction *I0 = cast<Instruction>(S.OpValue);
+ Instruction *I0 = S.getMainOp();
// Make sure invalid types (including vector type) are rejected before
// determining vectorization factor for scalar instructions.
for (Value *V : VL) {
@@ -19381,7 +19497,7 @@ public:
// %3 = extractelement <2 x i32> %a, i32 0
// %4 = extractelement <2 x i32> %a, i32 1
// %select = select i1 %cond, i32 %3, i32 %4
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Instruction *L1;
Instruction *L2;
@@ -19656,7 +19772,7 @@ public:
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC) {
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
@@ -19700,20 +19816,35 @@ public:
return cast<Instruction>(ScalarCond);
};
+ bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
+ return isBoolLogicOp(cast<Instruction>(V));
+ });
// Return new VectorizedTree, based on previous value.
auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
if (VectorizedTree) {
// Update the final value in the reduction.
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
- (isGuaranteedNotToBePoison(Res) &&
- !isGuaranteedNotToBePoison(VectorizedTree))) {
- auto It = ReducedValsToOps.find(Res);
- if (It != ReducedValsToOps.end() &&
- any_of(It->getSecond(),
- [](Instruction *I) { return isBoolLogicOp(I); }))
+ if (AnyBoolLogicOp) {
+ auto It = ReducedValsToOps.find(VectorizedTree);
+ auto It1 = ReducedValsToOps.find(Res);
+ if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
+ isGuaranteedNotToBePoison(VectorizedTree, AC) ||
+ (It != ReducedValsToOps.end() &&
+ any_of(It->getSecond(), [&](Instruction *I) {
+ return isBoolLogicOp(I) &&
+ getRdxOperand(I, 0) == VectorizedTree;
+ }))) {
+ ;
+ } else if (isGuaranteedNotToBePoison(Res, AC) ||
+ (It1 != ReducedValsToOps.end() &&
+ any_of(It1->getSecond(), [&](Instruction *I) {
+ return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
+ }))) {
std::swap(VectorizedTree, Res);
+ } else {
+ VectorizedTree = Builder.CreateFreeze(VectorizedTree);
+ }
}
return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
@@ -19722,9 +19853,6 @@ public:
// Initialize the final value in the reduction.
return Res;
};
- bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
- return isBoolLogicOp(cast<Instruction>(V));
- });
SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
ReductionOps.front().size());
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -19801,7 +19929,7 @@ public:
TrackedToOrig.try_emplace(RdxVal, RV);
}
SmallVector<int> Mask;
- if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+ if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
++I;
Candidates.swap(CommonCandidates);
ShuffledExtracts = true;
@@ -20116,7 +20244,7 @@ public:
// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be
// frozen.
- if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot))
+ if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
// Emit code to correctly handle reused reduced values, if required.
@@ -20223,13 +20351,13 @@ public:
bool InitStep) {
if (!AnyBoolLogicOp)
return;
- if (isBoolLogicOp(RedOp1) &&
- ((!InitStep && LHS == VectorizedTree) ||
- getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
+ if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
+ getRdxOperand(RedOp1, 0) == LHS ||
+ isGuaranteedNotToBePoison(LHS, AC)))
return;
if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
getRdxOperand(RedOp2, 0) == RHS ||
- isGuaranteedNotToBePoison(RHS))) {
+ isGuaranteedNotToBePoison(RHS, AC))) {
std::swap(LHS, RHS);
return;
}
@@ -20515,6 +20643,8 @@ private:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
+ case RecurKind::IFindLastIV:
+ case RecurKind::FFindLastIV:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -20612,6 +20742,8 @@ private:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
+ case RecurKind::IFindLastIV:
+ case RecurKind::FFindLastIV:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for reused scalars.");
}
@@ -20873,7 +21005,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -20979,8 +21111,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<int> Mask;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
- (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
- isFixedVectorShuffle(BuildVectorOpds, Mask)))
+ (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
+ isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
return false;
if (MaxVFOnly && BuildVectorInsts.size() == 2) {
@@ -21198,8 +21330,11 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
if (R.isDeleted(I))
continue;
for (Value *Op : I->operands())
- if (auto *RootOp = dyn_cast<Instruction>(Op))
+ if (auto *RootOp = dyn_cast<Instruction>(Op)) {
Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
+ if (R.isDeleted(I))
+ break;
+ }
}
// Try to vectorize operands as vector bundles.
for (CmpInst *I : CmpInsts) {
@@ -21735,9 +21870,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getType()->getScalarSizeInBits())
return false;
// UndefValues are compatible with all other values.
- if (isa<UndefValue>(V->getValueOperand()) ||
- isa<UndefValue>(V2->getValueOperand()))
- return false;
if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
@@ -21751,14 +21883,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S.getOpcode())
- return false;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(V->getValueOperand()) &&
- isa<Constant>(V2->getValueOperand()))
- return false;
return V->getValueOperand()->getValueID() <
V2->getValueOperand()->getValueID();
};
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index 4b0e12c28f07..ba62c45a4e70 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -325,6 +325,113 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) {
setDefUseUnscheduledSuccs(NewInterval);
}
+MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N,
+ bool IncludingN) const {
+ auto *I = N->getInstruction();
+ for (auto *PrevI = IncludingN ? I : I->getPrevNode(); PrevI != nullptr;
+ PrevI = PrevI->getPrevNode()) {
+ auto *PrevN = getNodeOrNull(PrevI);
+ if (PrevN == nullptr)
+ return nullptr;
+ if (auto *PrevMemN = dyn_cast<MemDGNode>(PrevN))
+ return PrevMemN;
+ }
+ return nullptr;
+}
+
+MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N,
+ bool IncludingN) const {
+ auto *I = N->getInstruction();
+ for (auto *NextI = IncludingN ? I : I->getNextNode(); NextI != nullptr;
+ NextI = NextI->getNextNode()) {
+ auto *NextN = getNodeOrNull(NextI);
+ if (NextN == nullptr)
+ return nullptr;
+ if (auto *NextMemN = dyn_cast<MemDGNode>(NextN))
+ return NextMemN;
+ }
+ return nullptr;
+}
+
+void DependencyGraph::notifyCreateInstr(Instruction *I) {
+ auto *MemN = dyn_cast<MemDGNode>(getOrCreateNode(I));
+ // TODO: Update the dependencies for the new node.
+
+ // Update the MemDGNode chain if this is a memory node.
+ if (MemN != nullptr) {
+ if (auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false)) {
+ PrevMemN->NextMemN = MemN;
+ MemN->PrevMemN = PrevMemN;
+ }
+ if (auto *NextMemN = getMemDGNodeAfter(MemN, /*IncludingN=*/false)) {
+ NextMemN->PrevMemN = MemN;
+ MemN->NextMemN = NextMemN;
+ }
+ }
+}
+
+void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) {
+ // Early return if `I` doesn't actually move.
+ BasicBlock *BB = To.getNodeParent();
+ if (To != BB->end() && &*To == I->getNextNode())
+ return;
+
+ // Maintain the DAGInterval.
+ DAGInterval.notifyMoveInstr(I, To);
+
+ // TODO: Perhaps check if this is legal by checking the dependencies?
+
+ // Update the MemDGNode chain to reflect the instr movement if necessary.
+ DGNode *N = getNodeOrNull(I);
+ if (N == nullptr)
+ return;
+ MemDGNode *MemN = dyn_cast<MemDGNode>(N);
+ if (MemN == nullptr)
+ return;
+ // First detach it from the existing chain.
+ MemN->detachFromChain();
+ // Now insert it back into the chain at the new location.
+ if (To != BB->end()) {
+ DGNode *ToN = getNodeOrNull(&*To);
+ if (ToN != nullptr) {
+ MemDGNode *PrevMemN = getMemDGNodeBefore(ToN, /*IncludingN=*/false);
+ MemDGNode *NextMemN = getMemDGNodeAfter(ToN, /*IncludingN=*/true);
+ MemN->PrevMemN = PrevMemN;
+ if (PrevMemN != nullptr)
+ PrevMemN->NextMemN = MemN;
+ MemN->NextMemN = NextMemN;
+ if (NextMemN != nullptr)
+ NextMemN->PrevMemN = MemN;
+ }
+ } else {
+ // MemN becomes the last instruction in the BB.
+ auto *TermN = getNodeOrNull(BB->getTerminator());
+ if (TermN != nullptr) {
+ MemDGNode *PrevMemN = getMemDGNodeBefore(TermN, /*IncludingN=*/false);
+ PrevMemN->NextMemN = MemN;
+ MemN->PrevMemN = PrevMemN;
+ } else {
+ // The terminator is outside the DAG interval so do nothing.
+ }
+ }
+}
+
+void DependencyGraph::notifyEraseInstr(Instruction *I) {
+ // Update the MemDGNode chain if this is a memory node.
+ if (auto *MemN = dyn_cast_or_null<MemDGNode>(getNodeOrNull(I))) {
+ auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false);
+ auto *NextMemN = getMemDGNodeAfter(MemN, /*IncludingN=*/false);
+ if (PrevMemN != nullptr)
+ PrevMemN->NextMemN = NextMemN;
+ if (NextMemN != nullptr)
+ NextMemN->PrevMemN = PrevMemN;
+ }
+
+ InstrToNodeMap.erase(I);
+
+ // TODO: Update the dependencies.
+}
+
Interval<Instruction> DependencyGraph::extend(ArrayRef<Instruction *> Instrs) {
if (Instrs.empty())
return {};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b801d1863e25..6d02efc05614 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -170,9 +170,7 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
}
void VPBlockBase::setPlan(VPlan *ParentPlan) {
- assert(
- (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) &&
- "Can only set plan on its entry or preheader block.");
+ assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
Plan = ParentPlan;
}
@@ -207,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
- for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry)))
- delete Block;
-}
-
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && It->isPhi())
@@ -222,9 +215,11 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
- InnerLoopVectorizer *ILV, VPlan *Plan)
+ InnerLoopVectorizer *ILV, VPlan *Plan,
+ Loop *CurrentParentLoop, Type *CanonicalIVTy)
: TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
+ CurrentParentLoop(CurrentParentLoop), LVer(nullptr),
+ TypeAnalysis(CanonicalIVTy) {}
Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
if (Def->isLiveIn())
@@ -309,9 +304,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
if (!hasScalarValue(Def, LastLane)) {
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
// VPExpandSCEVRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
- isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
+ assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe,
+ VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -360,7 +354,7 @@ void VPTransformState::addNewMetadata(Instruction *To,
const Instruction *Orig) {
// If the loop was versioned with memchecks, add the corresponding no-alias
// metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ if (LVer && isa<LoadInst, StoreInst>(Orig))
LVer->annotateInstWithNoAlias(To, Orig);
}
@@ -476,6 +470,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
connectToPredecessors(State->CFG);
}
+VPIRBasicBlock *VPIRBasicBlock::clone() {
+ auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
+ for (VPRecipeBase &R : Recipes)
+ NewBlock->appendRecipe(R.clone());
+ return NewBlock;
+}
+
void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = bool(State->Lane);
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -502,8 +503,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
// Register NewBB in its loop. In innermost loops its the same for all
// BB's.
- if (State->CurrentVectorLoop)
- State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
+ if (State->CurrentParentLoop)
+ State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
State->Builder.SetInsertPoint(Terminator);
State->CFG.PrevBB = NewBB;
@@ -515,14 +516,11 @@ void VPBasicBlock::execute(VPTransformState *State) {
executeRecipes(State, NewBB);
}
-void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
- for (VPRecipeBase &R : Recipes) {
- for (auto *Def : R.definedValues())
- Def->replaceAllUsesWith(NewValue);
-
- for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
- R.setOperand(I, NewValue);
- }
+VPBasicBlock *VPBasicBlock::clone() {
+ auto *NewBlock = getPlan()->createVPBasicBlock(getName());
+ for (VPRecipeBase &R : *this)
+ NewBlock->appendRecipe(R.clone());
+ return NewBlock;
}
void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
@@ -543,7 +541,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
SmallVector<VPBlockBase *, 2> Succs(successors());
// Create new empty block after the block to split.
- auto *SplitBlock = new VPBasicBlock(getName() + ".split");
+ auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
VPBlockUtils::insertBlockAfter(SplitBlock, this);
// Finally, move the recipes starting at SplitAt to new block.
@@ -703,37 +701,30 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
VPRegionBlock *VPRegionBlock::clone() {
const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
- auto *NewRegion =
- new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+ auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
+ getName(), isReplicator());
for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
Block->setParent(NewRegion);
return NewRegion;
}
-void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
- // Drop all references in VPBasicBlocks and replace all uses with
- // DummyValue.
- Block->dropAllReferences(NewValue);
-}
-
void VPRegionBlock::execute(VPTransformState *State) {
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
RPOT(Entry);
if (!isReplicator()) {
// Create and register the new vector loop.
- Loop *PrevLoop = State->CurrentVectorLoop;
- State->CurrentVectorLoop = State->LI->AllocateLoop();
+ Loop *PrevLoop = State->CurrentParentLoop;
+ State->CurrentParentLoop = State->LI->AllocateLoop();
BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
// Insert the new loop into the loop nest and register the new basic blocks
// before calling any utilities such as SCEV that require valid LoopInfo.
if (ParentLoop)
- ParentLoop->addChildLoop(State->CurrentVectorLoop);
+ ParentLoop->addChildLoop(State->CurrentParentLoop);
else
- State->LI->addTopLevelLoop(State->CurrentVectorLoop);
+ State->LI->addTopLevelLoop(State->CurrentParentLoop);
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
@@ -741,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
Block->execute(State);
}
- State->CurrentVectorLoop = PrevLoop;
+ State->CurrentParentLoop = PrevLoop;
return;
}
@@ -823,16 +814,27 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
-VPlan::~VPlan() {
- if (Entry) {
- VPValue DummyValue;
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
- Block->dropAllReferences(&DummyValue);
-
- VPBlockBase::deleteCFG(Entry);
+VPlan::VPlan(Loop *L) {
+ setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
+ ScalarHeader = createVPIRBasicBlock(L->getHeader());
+}
- Preheader->dropAllReferences(&DummyValue);
- delete Preheader;
+VPlan::~VPlan() {
+ VPValue DummyValue;
+
+ for (auto *VPB : CreatedBlocks) {
+ if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+ // Replace all operands of recipes and all VPValues defined in VPBB with
+ // DummyValue so the block can be deleted.
+ for (VPRecipeBase &R : *VPBB) {
+ for (auto *Def : R.definedValues())
+ Def->replaceAllUsesWith(&DummyValue);
+
+ for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
+ R.setOperand(I, &DummyValue);
+ }
+ }
+ delete VPB;
}
for (VPValue *VPV : VPLiveInsToFree)
delete VPV;
@@ -840,34 +842,27 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
- auto *VPIRBB = new VPIRBasicBlock(IRBB);
- for (Instruction &I :
- make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
- VPIRBB->appendRecipe(new VPIRInstruction(I));
- return VPIRBB;
-}
-
VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
bool TailFolded, Loop *TheLoop) {
- VPIRBasicBlock *Entry =
- VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader());
- VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
- VPIRBasicBlock *ScalarHeader =
- VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
- auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
+ auto Plan = std::make_unique<VPlan>(TheLoop);
+ VPBlockBase *ScalarHeader = Plan->getScalarHeader();
+
+ // Connect entry only to vector preheader initially. Entry will also be
+ // connected to the scalar preheader later, during skeleton creation when
+ // runtime guards are added as needed. Note that when executing the VPlan for
+ // an epilogue vector loop, the original entry block here will be replaced by
+ // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
+ // generating code for the main vector loop.
+ VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
+ VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
// Create SCEV and VPValue for the trip count.
-
- // Currently only loops with countable exits are vectorized, but calling
- // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
- // uncountable exits whilst also ensuring the symbolic maximum and known
- // back-edge taken count remain identical for loops with countable exits.
+ // We use the symbolic max backedge-taken-count, which works also when
+ // vectorizing loops with uncountable early exits.
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
- assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
- BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
"Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
@@ -877,17 +872,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// Create VPRegionBlock, with empty header and latch blocks, to be filled
// during processing later.
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
- VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+ VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
+ VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
- false /*isReplicator*/);
+ auto *TopRegion = Plan->createVPRegionBlock(
+ HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
- VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+ VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -902,8 +897,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// 2) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
- BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
+ auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -927,7 +922,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
}
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
- Value *CanonicalIVStartValue,
VPTransformState &State) {
Type *TCTy = TripCountV->getType();
// Check if the backedge taken count is needed, and if so build it.
@@ -953,41 +947,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
} else {
VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
}
-
- // When vectorizing the epilogue loop, the canonical induction start value
- // needs to be changed from zero to the value after the main vector loop.
- // FIXME: Improve modeling for canonical IV start values in the epilogue loop.
- if (CanonicalIVStartValue) {
- VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue);
- auto *IV = getCanonicalIV();
- assert(all_of(IV->users(),
- [](const VPUser *U) {
- return isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPScalarCastRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U) ||
- cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
- }) &&
- "the canonical IV should only be used by its increment or "
- "ScalarIVSteps when resetting the start value");
- IV->setOperand(0, VPV);
- }
-}
-
-/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
-/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
-/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
-/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
- VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
- for (auto &R : make_early_inc_range(*VPBB)) {
- assert(!R.isPhi() && "Tried to move phi recipe to end of block");
- R.moveBefore(*IRVPBB, IRVPBB->end());
- }
-
- VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-
- delete VPBB;
}
/// Generate the code inside the preheader and body of the vectorized loop.
@@ -997,27 +956,23 @@ void VPlan::execute(VPTransformState *State) {
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
- BasicBlock *VectorPreHeader = State->CFG.PrevBB;
- State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
// Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+ BasicBlock *VectorPreHeader = State->CFG.PrevBB;
cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
State->CFG.DTU.applyUpdates(
{{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
- // Replace regular VPBB's for the middle and scalar preheader blocks with
- // VPIRBasicBlocks wrapping their IR blocks. The IR blocks are created during
- // skeleton creation, so we can only create the VPIRBasicBlocks now during
- // VPlan execution rather than earlier during VPlan construction.
- BasicBlock *MiddleBB = State->CFG.ExitBB;
- VPBasicBlock *MiddleVPBB = getMiddleBlock();
- BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
- replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
- replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
+ << ", UF=" << getUF() << '\n');
+ setName("Final VPlan");
+ LLVM_DEBUG(dump());
// Disconnect the middle block from its single successor (the scalar loop
// header) in both the CFG and DT. The branch will be recreated during VPlan
// execution.
+ BasicBlock *MiddleBB = State->CFG.ExitBB;
+ BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
auto *BrInst = new UnreachableInst(MiddleBB->getContext());
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
@@ -1028,8 +983,11 @@ void VPlan::execute(VPTransformState *State) {
State->CFG.DTU.applyUpdates(
{{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
- // Generate code in the loop pre-header and body.
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
+ // Generate code for the VPlan, in parts of the vector skeleton, loop body and
+ // successor blocks including the middle, exit and scalar preheader blocks.
+ for (VPBlockBase *Block : RPOT)
Block->execute(State);
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
@@ -1043,8 +1001,7 @@ void VPlan::execute(VPTransformState *State) {
if (isa<VPWidenPHIRecipe>(&R))
continue;
- if (isa<VPWidenPointerInductionRecipe>(&R) ||
- isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ if (isa<VPWidenInductionRecipe>(&R)) {
PHINode *Phi = nullptr;
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
@@ -1079,9 +1036,6 @@ void VPlan::execute(VPTransformState *State) {
}
State->CFG.DTU.flush();
- assert(State->CFG.DTU.getDomTree().verify(
- DominatorTree::VerificationLevel::Fast) &&
- "DT not preserved correctly");
}
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
@@ -1090,6 +1044,21 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
return getVectorLoopRegion()->cost(VF, Ctx);
}
+VPRegionBlock *VPlan::getVectorLoopRegion() {
+ // TODO: Cache if possible.
+ for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+ if (auto *R = dyn_cast<VPRegionBlock>(B))
+ return R;
+ return nullptr;
+}
+
+const VPRegionBlock *VPlan::getVectorLoopRegion() const {
+ for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+ if (auto *R = dyn_cast<VPRegionBlock>(B))
+ return R;
+ return nullptr;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPlan::printLiveIns(raw_ostream &O) const {
VPSlotTracker SlotTracker(this);
@@ -1134,12 +1103,9 @@ void VPlan::print(raw_ostream &O) const {
printLiveIns(O);
- if (!getPreheader()->empty()) {
- O << "\n";
- getPreheader()->print(O, "", SlotTracker);
- }
-
- for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) {
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
+ RPOT(getEntry());
+ for (const VPBlockBase *Block : RPOT) {
O << '\n';
Block->print(O, "", SlotTracker);
}
@@ -1219,8 +1185,8 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
}
VPlan *VPlan::duplicate() {
+ unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
// Clone blocks.
- VPBasicBlock *NewPreheader = Preheader->clone();
const auto &[NewEntry, __] = cloneFrom(Entry);
BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
@@ -1230,8 +1196,7 @@ VPlan *VPlan::duplicate() {
return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
}));
// Create VPlan, clone live-ins and remap operands in the cloned blocks.
- auto *NewPlan =
- new VPlan(NewPreheader, cast<VPBasicBlock>(NewEntry), NewScalarHeader);
+ auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader);
DenseMap<VPValue *, VPValue *> Old2NewVPValues;
for (VPValue *OldLiveIn : VPLiveInsToFree) {
Old2NewVPValues[OldLiveIn] =
@@ -1251,7 +1216,6 @@ VPlan *VPlan::duplicate() {
// else NewTripCount will be created and inserted into Old2NewVPValues when
// TripCount is cloned. In any case NewPlan->TripCount is updated below.
- remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
// Initialize remaining fields of cloned VPlan.
@@ -1262,9 +1226,32 @@ VPlan *VPlan::duplicate() {
assert(Old2NewVPValues.contains(TripCount) &&
"TripCount must have been added to Old2NewVPValues");
NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+ // Transfer all cloned blocks (the second half of all current blocks) from
+ // current to new VPlan.
+ unsigned NumBlocksAfterCloning = CreatedBlocks.size();
+ for (unsigned I :
+ seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning))
+ NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]);
+ CreatedBlocks.truncate(NumBlocksBeforeCloning);
+
return NewPlan;
}
+VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
+ auto *VPIRBB = new VPIRBasicBlock(IRBB);
+ CreatedBlocks.push_back(VPIRBB);
+ return VPIRBB;
+}
+
+VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
+ auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
+ for (Instruction &I :
+ make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
+ VPIRBB->appendRecipe(new VPIRInstruction(I));
+ return VPIRBB;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
@@ -1303,8 +1290,6 @@ void VPlanPrinter::dump() {
OS << "edge [fontname=Courier, fontsize=30]\n";
OS << "compound=true\n";
- dumpBlock(Plan.getPreheader());
-
for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
dumpBlock(Block);
@@ -1565,7 +1550,6 @@ void VPSlotTracker::assignNames(const VPlan &Plan) {
assignName(Plan.BackedgeTakenCount);
for (VPValue *LI : Plan.VPLiveInsToFree)
assignName(LI);
- assignNames(Plan.getPreheader());
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d828f038f9..88f3f672d3aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -236,7 +236,8 @@ public:
struct VPTransformState {
VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
- InnerLoopVectorizer *ILV, VPlan *Plan);
+ InnerLoopVectorizer *ILV, VPlan *Plan,
+ Loop *CurrentParentLoop, Type *CanonicalIVTy);
/// Target Transform Info.
const TargetTransformInfo *TTI;
@@ -373,8 +374,8 @@ struct VPTransformState {
/// Pointer to the VPlan code is generated for.
VPlan *Plan;
- /// The loop object for the current parent region, or nullptr.
- Loop *CurrentVectorLoop = nullptr;
+ /// The parent loop object for the current scope, or nullptr.
+ Loop *CurrentParentLoop = nullptr;
/// LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
@@ -621,6 +622,14 @@ public:
/// Remove all the successors of this block.
void clearSuccessors() { Successors.clear(); }
+ /// Swap successors of the block. The block must have exactly 2 successors.
+ // TODO: This should be part of introducing conditional branch recipes rather
+ // than being independent.
+ void swapSuccessors() {
+ assert(Successors.size() == 2 && "must have 2 successors to swap");
+ std::swap(Successors[0], Successors[1]);
+ }
+
/// The method which generates the output IR that correspond to this
/// VPBlockBase, thereby "executing" the VPlan.
virtual void execute(VPTransformState *State) = 0;
@@ -628,9 +637,6 @@ public:
/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
- /// Delete all blocks reachable from a given VPBlockBase, inclusive.
- static void deleteCFG(VPBlockBase *Entry);
-
/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
// There are currently no constraints that prevent an instruction to be
@@ -638,10 +644,6 @@ public:
return true;
}
- /// Replace all operands of VPUsers in the block with \p NewValue and also
- /// replaces all uses of VPValues defined in the block with NewValue.
- virtual void dropAllReferences(VPValue *NewValue) = 0;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
OS << getName();
@@ -944,11 +946,6 @@ public:
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
- struct GEPFlagsTy {
- char IsInBounds : 1;
- GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
- };
-
private:
struct ExactFlagsTy {
char IsExact : 1;
@@ -975,7 +972,7 @@ private:
WrapFlagsTy WrapFlags;
DisjointFlagsTy DisjointFlags;
ExactFlagsTy ExactFlags;
- GEPFlagsTy GEPFlags;
+ GEPNoWrapFlags GEPFlags;
NonNegFlagsTy NonNegFlags;
FastMathFlagsTy FMFs;
unsigned AllFlags;
@@ -1012,7 +1009,7 @@ public:
ExactFlags.IsExact = Op->isExact();
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
OpType = OperationType::GEPOp;
- GEPFlags.IsInBounds = GEP->isInBounds();
+ GEPFlags = GEP->getNoWrapFlags();
} else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
OpType = OperationType::NonNegOp;
NonNegFlags.NonNeg = PNNI->hasNonNeg();
@@ -1052,7 +1049,7 @@ public:
protected:
template <typename IterT>
VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- GEPFlagsTy GEPFlags, DebugLoc DL = {})
+ GEPNoWrapFlags GEPFlags, DebugLoc DL = {})
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
GEPFlags(GEPFlags) {}
@@ -1089,7 +1086,7 @@ public:
ExactFlags.IsExact = false;
break;
case OperationType::GEPOp:
- GEPFlags.IsInBounds = false;
+ GEPFlags = GEPNoWrapFlags::none();
break;
case OperationType::FPMathOp:
FMFs.NoNaNs = false;
@@ -1118,10 +1115,7 @@ public:
I->setIsExact(ExactFlags.IsExact);
break;
case OperationType::GEPOp:
- // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan.
- cast<GetElementPtrInst>(I)->setNoWrapFlags(
- GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds()
- : GEPNoWrapFlags::none());
+ cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags);
break;
case OperationType::FPMathOp:
I->setHasAllowReassoc(FMFs.AllowReassoc);
@@ -1147,11 +1141,7 @@ public:
return CmpPredicate;
}
- bool isInBounds() const {
- assert(OpType == OperationType::GEPOp &&
- "recipe doesn't have inbounds flag");
- return GEPFlags.IsInBounds;
- }
+ GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
/// Returns true if the recipe has fast-math flags.
bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }
@@ -1232,6 +1222,9 @@ public:
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ // Returns a scalar boolean value, which is true if any lane of its single
+ // operand is true.
+ AnyOf,
};
private:
@@ -1295,7 +1288,7 @@ public:
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}
- VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,
+ VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags,
DebugLoc DL = {}, const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC,
ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
@@ -1336,14 +1329,6 @@ public:
LLVM_DUMP_METHOD void dump() const;
#endif
- /// Return true if this instruction may modify memory.
- bool mayWriteToMemory() const {
- // TODO: we can use attributes of the called function to rule out memory
- // modifications.
- return Opcode == Instruction::Store || Opcode == Instruction::Call ||
- Opcode == Instruction::Invoke || Opcode == SLPStore;
- }
-
bool hasResult() const {
// CallInst may or may not have a result, depending on the called function.
// Conservatively return calls have results for now.
@@ -1662,7 +1647,7 @@ public:
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+ : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -1697,6 +1682,9 @@ public:
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
+ /// Return the ID of the intrinsic.
+ Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
+
/// Return the scalar return type of the intrinsic.
Type *getResultType() const { return ResultTy; }
@@ -1911,10 +1899,9 @@ class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
public:
VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- bool IsInBounds, DebugLoc DL)
+ GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
- ArrayRef<VPValue *>({Ptr, VF}),
- GEPFlagsTy(IsInBounds), DL),
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC)
@@ -1946,8 +1933,9 @@ public:
}
VPReverseVectorPointerRecipe *clone() override {
- return new VPReverseVectorPointerRecipe(
- getOperand(0), getVFValue(), IndexedTy, isInBounds(), getDebugLoc());
+ return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(),
+ IndexedTy, getGEPNoWrapFlags(),
+ getDebugLoc());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1963,10 +1951,10 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
Type *IndexedTy;
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsInBounds,
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlagsTy(IsInBounds), DL),
+ GEPFlags, DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -1988,8 +1976,8 @@ public:
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy, isInBounds(),
- getDebugLoc());
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+ getGEPNoWrapFlags(), getDebugLoc());
}
/// Return the cost of this VPHeaderPHIRecipe.
@@ -2088,28 +2076,72 @@ public:
}
};
+/// Base class for widened induction recipes (VPWidenIntOrFpInductionRecipe and
+/// VPWidenPointerInductionRecipe), providing shared functionality, including
+/// retrieving the step value, induction descriptor and original phi node.
+class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
+ const InductionDescriptor &IndDesc;
+
+public:
+ VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start,
+ VPValue *Step, const InductionDescriptor &IndDesc,
+ DebugLoc DL)
+ : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) {
+ addOperand(Step);
+ }
+
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC ||
+ R->getVPDefID() == VPDef::VPWidenPointerInductionSC;
+ }
+
+ virtual void execute(VPTransformState &State) override = 0;
+
+ /// Returns the step value of the induction.
+ VPValue *getStepValue() { return getOperand(1); }
+ const VPValue *getStepValue() const { return getOperand(1); }
+
+ PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); }
+
+ /// Returns the induction descriptor for the recipe.
+ const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
+
+ VPValue *getBackedgeValue() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+
+ VPRecipeBase &getBackedgeRecipe() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+};
+
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector values.
-class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
- PHINode *IV;
+class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
TruncInst *Trunc;
- const InductionDescriptor &IndDesc;
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
- VPValue *VF, const InductionDescriptor &IndDesc)
- : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
- Trunc(nullptr), IndDesc(IndDesc) {
- addOperand(Step);
+ VPValue *VF, const InductionDescriptor &IndDesc,
+ DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
+ Step, IndDesc, DL),
+ Trunc(nullptr) {
addOperand(VF);
}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
VPValue *VF, const InductionDescriptor &IndDesc,
- TruncInst *Trunc)
- : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
- IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
- addOperand(Step);
+ TruncInst *Trunc, DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
+ Step, IndDesc, DL),
+ Trunc(Trunc) {
addOperand(VF);
}
@@ -2117,7 +2149,8 @@ public:
VPWidenIntOrFpInductionRecipe *clone() override {
return new VPWidenIntOrFpInductionRecipe(
- IV, getStartValue(), getStepValue(), getVFValue(), IndDesc, Trunc);
+ getPHINode(), getStartValue(), getStepValue(), getVFValue(),
+ getInductionDescriptor(), Trunc, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
@@ -2132,24 +2165,6 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- VPValue *getBackedgeValue() override {
- // TODO: All operands of base recipe must exist and be at same index in
- // derived recipe.
- llvm_unreachable(
- "VPWidenIntOrFpInductionRecipe generates its own backedge value");
- }
-
- VPRecipeBase &getBackedgeRecipe() override {
- // TODO: All operands of base recipe must exist and be at same index in
- // derived recipe.
- llvm_unreachable(
- "VPWidenIntOrFpInductionRecipe generates its own backedge value");
- }
-
- /// Returns the step value of the induction.
- VPValue *getStepValue() { return getOperand(1); }
- const VPValue *getStepValue() const { return getOperand(1); }
-
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }
@@ -2164,11 +2179,6 @@ public:
TruncInst *getTruncInst() { return Trunc; }
const TruncInst *getTruncInst() const { return Trunc; }
- PHINode *getPHINode() { return IV; }
-
- /// Returns the induction descriptor for the recipe.
- const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-
/// Returns true if the induction is canonical, i.e. starting at 0 and
/// incremented by UF * VF (= the original IV is incremented by 1) and has the
/// same type as the canonical induction.
@@ -2176,7 +2186,7 @@ public:
/// Returns the scalar type of the induction.
Type *getScalarType() const {
- return Trunc ? Trunc->getType() : IV->getType();
+ return Trunc ? Trunc->getType() : getPHINode()->getType();
}
/// Returns the VPValue representing the value of this induction at
@@ -2187,10 +2197,8 @@ public:
}
};
-class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
+class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
public VPUnrollPartAccessor<3> {
- const InductionDescriptor &IndDesc;
-
bool IsScalarAfterVectorization;
public:
@@ -2198,20 +2206,17 @@ public:
/// Start.
VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
const InductionDescriptor &IndDesc,
- bool IsScalarAfterVectorization)
- : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi),
- IndDesc(IndDesc),
- IsScalarAfterVectorization(IsScalarAfterVectorization) {
- addOperand(Start);
- addOperand(Step);
- }
+ bool IsScalarAfterVectorization, DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start,
+ Step, IndDesc, DL),
+ IsScalarAfterVectorization(IsScalarAfterVectorization) {}
~VPWidenPointerInductionRecipe() override = default;
VPWidenPointerInductionRecipe *clone() override {
return new VPWidenPointerInductionRecipe(
cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
- IndDesc, IsScalarAfterVectorization);
+ getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
@@ -2222,9 +2227,6 @@ public:
/// Returns true if only scalar values will be generated.
bool onlyScalarsGenerated(bool IsScalable);
- /// Returns the induction descriptor for the recipe.
- const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-
/// Returns the VPValue representing the value of this induction at
/// the first unrolled part, if it exists. Returns itself if unrolling did not
/// take place.
@@ -2589,8 +2591,9 @@ class VPReductionRecipe : public VPSingleDefRecipe {
protected:
VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
Instruction *I, ArrayRef<VPValue *> Operands,
- VPValue *CondOp, bool IsOrdered)
- : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
+ VPValue *CondOp, bool IsOrdered, DebugLoc DL)
+ : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
+ IsOrdered(IsOrdered) {
if (CondOp) {
IsConditional = true;
addOperand(CondOp);
@@ -2600,16 +2603,17 @@ protected:
public:
VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered)
+ bool IsOrdered, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, R, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered) {}
+ IsOrdered, DL) {}
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
- getVecOp(), getCondOp(), IsOrdered);
+ getVecOp(), getCondOp(), IsOrdered,
+ getDebugLoc());
}
static inline bool classof(const VPRecipeBase *R) {
@@ -2664,7 +2668,7 @@ public:
VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
cast_or_null<Instruction>(R.getUnderlyingValue()),
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
- R.isOrdered()) {}
+ R.isOrdered(), R.getDebugLoc()) {}
~VPReductionEVLRecipe() override = default;
@@ -2834,12 +2838,12 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe {
public:
/// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
/// nodes after merging back from a Branch-on-Mask.
- VPPredInstPHIRecipe(VPValue *PredV)
- : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
+ VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL)
+ : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {}
~VPPredInstPHIRecipe() override = default;
VPPredInstPHIRecipe *clone() override {
- return new VPPredInstPHIRecipe(getOperand(0));
+ return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
@@ -3203,11 +3207,6 @@ public:
return true;
}
- /// Check if the induction described by \p Kind, /p Start and \p Step is
- /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
- bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
- VPValue *Step) const;
-
/// Return the cost of this VPCanonicalIVPHIRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override {
@@ -3551,8 +3550,6 @@ public:
return make_range(begin(), getFirstNonPhi());
}
- void dropAllReferences(VPValue *NewValue) override;
-
/// Split current block at \p SplitAt by inserting a new block between the
/// current block and its successors and moving all recipes starting at
/// SplitAt to the new block. Returns the new block.
@@ -3582,12 +3579,7 @@ public:
/// Clone the current block and it's recipes, without updating the operands of
/// the cloned recipes.
- VPBasicBlock *clone() override {
- auto *NewBlock = new VPBasicBlock(getName());
- for (VPRecipeBase &R : *this)
- NewBlock->appendRecipe(R.clone());
- return NewBlock;
- }
+ VPBasicBlock *clone() override;
protected:
/// Execute the recipes in the IR basic block \p BB.
@@ -3623,20 +3615,11 @@ public:
return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
}
- /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
- /// instructions in \p IRBB, except its terminator which is managed in VPlan.
- static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB);
-
/// The method which generates the output IR instructions that correspond to
/// this VPBasicBlock, thereby "executing" the VPlan.
void execute(VPTransformState *State) override;
- VPIRBasicBlock *clone() override {
- auto *NewBlock = new VPIRBasicBlock(IRBB);
- for (VPRecipeBase &R : Recipes)
- NewBlock->appendRecipe(R.clone());
- return NewBlock;
- }
+ VPIRBasicBlock *clone() override;
BasicBlock *getIRBasicBlock() const { return IRBB; }
};
@@ -3675,13 +3658,7 @@ public:
: VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
IsReplicator(IsReplicator) {}
- ~VPRegionBlock() override {
- if (Entry) {
- VPValue DummyValue;
- Entry->dropAllReferences(&DummyValue);
- deleteCFG(Entry);
- }
- }
+ ~VPRegionBlock() override {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPBlockBase *V) {
@@ -3729,8 +3706,6 @@ public:
// Return the cost of this region.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
- void dropAllReferences(VPValue *NewValue) override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
/// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
@@ -3757,14 +3732,12 @@ class VPlan {
friend class VPlanPrinter;
friend class VPSlotTracker;
- /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
- /// preheader of the vector loop.
- VPBasicBlock *Entry;
-
/// VPBasicBlock corresponding to the original preheader. Used to place
/// VPExpandSCEV recipes for expressions used during skeleton creation and the
/// rest of VPlan execution.
- VPBasicBlock *Preheader;
+ /// When this VPlan is used for the epilogue vector loop, the entry will be
+ /// replaced by a new entry block created during skeleton creation.
+ VPBasicBlock *Entry;
/// VPIRBasicBlock wrapping the header of the original scalar loop.
VPIRBasicBlock *ScalarHeader;
@@ -3809,46 +3782,50 @@ class VPlan {
/// been modeled in VPlan directly.
DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
-public:
- /// Construct a VPlan with original preheader \p Preheader, trip count \p TC,
- /// \p Entry to the plan and with \p ScalarHeader wrapping the original header
- /// of the scalar loop. At the moment, \p Preheader and \p Entry need to be
- /// disconnected, as the bypass blocks between them are not yet modeled in
- /// VPlan.
- VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry,
- VPIRBasicBlock *ScalarHeader)
- : VPlan(Preheader, Entry, ScalarHeader) {
- TripCount = TC;
- }
+ /// Blocks allocated and owned by the VPlan. They will be deleted once the
+ /// VPlan is destroyed.
+ SmallVector<VPBlockBase *> CreatedBlocks;
- /// Construct a VPlan with original preheader \p Preheader, \p Entry to
- /// the plan and with \p ScalarHeader wrapping the original header of the
- /// scalar loop. At the moment, \p Preheader and \p Entry need to be
- /// disconnected, as the bypass blocks between them are not yet modeled in
- /// VPlan.
- VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry,
- VPIRBasicBlock *ScalarHeader)
- : Entry(Entry), Preheader(Preheader), ScalarHeader(ScalarHeader) {
+ /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
+ /// wrapping the original header of the scalar loop.
+ VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
+ : Entry(Entry), ScalarHeader(ScalarHeader) {
Entry->setPlan(this);
- Preheader->setPlan(this);
- assert(Preheader->getNumSuccessors() == 0 &&
- Preheader->getNumPredecessors() == 0 &&
- "preheader must be disconnected");
assert(ScalarHeader->getNumSuccessors() == 0 &&
"scalar header must be a leaf node");
}
+public:
+ /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
+ /// original preheader and scalar header of \p L, to be used as entry and
+ /// scalar header blocks of the new VPlan.
+ VPlan(Loop *L);
+
+ /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
+ /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
+ VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
+ setEntry(createVPBasicBlock("preheader"));
+ ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
+ TripCount = TC;
+ }
+
~VPlan();
+ void setEntry(VPBasicBlock *VPBB) {
+ Entry = VPBB;
+ VPBB->setPlan(this);
+ }
+
/// Create initial VPlan, having an "entry" VPBasicBlock (wrapping
- /// original scalar pre-header ) which contains SCEV expansions that need
- /// to happen before the CFG is modified; a VPBasicBlock for the vector
- /// pre-header, followed by a region for the vector loop, followed by the
- /// middle VPBasicBlock. If a check is needed to guard executing the scalar
- /// epilogue loop, it will be added to the middle block, together with
- /// VPBasicBlocks for the scalar preheader and exit blocks.
- /// \p InductionTy is the type of the canonical induction and used for related
- /// values, like the trip count expression.
+ /// original scalar pre-header) which contains SCEV expansions that need
+ /// to happen before the CFG is modified (when executing a VPlan for the
+ /// epilogue vector loop, the original entry needs to be replaced by a new
+ /// one); a VPBasicBlock for the vector pre-header, followed by a region for
+ /// the vector loop, followed by the middle VPBasicBlock. If a check is needed
+ /// to guard executing the scalar epilogue loop, it will be added to the
+ /// middle block, together with VPBasicBlocks for the scalar preheader and
+ /// exit blocks. \p InductionTy is the type of the canonical induction and
+ /// used for related values, like the trip count expression.
static VPlanPtr createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
@@ -3856,7 +3833,7 @@ public:
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
- Value *CanonicalIVStartValue, VPTransformState &State);
+ VPTransformState &State);
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
@@ -3873,26 +3850,22 @@ public:
}
/// Returns the VPRegionBlock of the vector loop.
- VPRegionBlock *getVectorLoopRegion() {
- return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
- }
- const VPRegionBlock *getVectorLoopRegion() const {
- return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
- }
+ VPRegionBlock *getVectorLoopRegion();
+ const VPRegionBlock *getVectorLoopRegion() const;
/// Returns the 'middle' block of the plan, that is the block that selects
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch.
const VPBasicBlock *getMiddleBlock() const {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
}
VPBasicBlock *getMiddleBlock() {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
}
/// Return the VPBasicBlock for the preheader of the scalar loop.
VPBasicBlock *getScalarPreheader() const {
- return cast<VPBasicBlock>(ScalarHeader->getSinglePredecessor());
+ return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor());
}
/// Return the VPIRBasicBlock wrapping the header of the scalar loop.
@@ -4027,13 +4000,52 @@ public:
SCEVToExpansion[S] = V;
}
- /// \return The block corresponding to the original preheader.
- VPBasicBlock *getPreheader() { return Preheader; }
- const VPBasicBlock *getPreheader() const { return Preheader; }
-
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
+
+ /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
+ /// present. The returned block is owned by the VPlan and deleted once the
+ /// VPlan is destroyed.
+ VPBasicBlock *createVPBasicBlock(const Twine &Name,
+ VPRecipeBase *Recipe = nullptr) {
+ auto *VPB = new VPBasicBlock(Name, Recipe);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+ /// IsReplicator is true, the region is a replicate region. The returned block
+ /// is owned by the VPlan and deleted once the VPlan is destroyed.
+ VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+ const std::string &Name = "",
+ bool IsReplicator = false) {
+ auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+ /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+ /// The returned block is owned by the VPlan and deleted once the VPlan is
+ /// destroyed.
+ VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+ bool IsReplicator = false) {
+ auto *VPB = new VPRegionBlock(Name, IsReplicator);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
+ /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
+ /// block is owned by the VPlan and deleted once the VPlan is destroyed.
+ VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
+
+ /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+ /// instructions in \p IRBB, except its terminator which is managed by the
+ /// successors of the block in VPlan. The returned block is owned by the VPlan
+ /// and deleted once the VPlan is destroyed.
+ VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -4179,8 +4191,6 @@ public:
"Can't connect two block with different parents");
assert((SuccIdx != -1u || From->getNumSuccessors() < 2) &&
"Blocks can't have more than two successors.");
- assert((PredIdx != -1u || To->getNumPredecessors() < 2) &&
- "Blocks can't have more than two predecessors.");
if (SuccIdx == -1u)
From->appendSuccessor(To);
else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 969d07b229e4..35497a7431f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -61,10 +61,16 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case Instruction::ICmp:
case VPInstruction::ActiveLaneMask:
return inferScalarType(R->getOperand(1));
+ case VPInstruction::ComputeReductionResult: {
+ auto *PhiR = cast<VPReductionPHIRecipe>(R->getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ return OrigPhi->getType();
+ }
case VPInstruction::ExplicitVectorLength:
return Type::getIntNTy(Ctx, 32);
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
+ case VPInstruction::ResumePhi:
return SetResultTyFromOp();
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
@@ -127,7 +133,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
+ assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6e633739fcc3..76ed578424df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
// Create new VPBB.
StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
- VPBasicBlock *VPBB = new VPBasicBlock(Name);
+ VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
BB2VPBB[BB] = VPBB;
// Get or create a region for the loop containing BB.
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
if (LoopOfBB == TheLoop) {
RegionOfVPBB = Plan.getVectorLoopRegion();
} else {
- RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+ RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
}
RegionOfVPBB->setEntry(VPBB);
@@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() {
BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
VectorHeaderVPBB->clearSuccessors();
VectorLatchVPBB->clearPredecessors();
- if (TheLoop->getHeader() != TheLoop->getLoopLatch()) {
+ if (TheLoop->getHeader() != TheLoop->getLoopLatch())
BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
- } else {
+ else
TheRegion->setExiting(VectorHeaderVPBB);
- delete VectorLatchVPBB;
- }
// 1. Scan the body of the loop in a topological order to visit each basic
// block after having visited its predecessor basic blocks. Create a VPBB for
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 9e8f9f3f4002..ad6e2ad90a96 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -32,11 +32,11 @@ class Loop;
class LoopInfo;
class VPRegionBlock;
class VPlan;
-class VPlanTestBase;
+class VPlanTestIRBase;
/// Main class to build the VPlan H-CFG for an incoming IR.
class VPlanHCFGBuilder {
- friend VPlanTestBase;
+ friend VPlanTestIRBase;
private:
// The outermost loop of the input loop nest considered for vectorization.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 18e5e2996c82..ec3c203a61b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -78,6 +78,8 @@ template <unsigned BitWidth = 0> struct specific_intval {
if (!VPV->isLiveIn())
return false;
Value *V = VPV->getLiveInIRValue();
+ if (!V)
+ return false;
const auto *CI = dyn_cast<ConstantInt>(V);
if (!CI && V->getType()->isVectorTy())
if (const auto *C = dyn_cast<Constant>(V))
@@ -136,7 +138,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
// Check for recipes that do not have opcodes.
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
- std::is_same<RecipeTy, VPWidenSelectRecipe>::value)
+ std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
+ std::is_same<RecipeTy, VPDerivedIVRecipe>::value)
return DefR;
else
return DefR && DefR->getOpcode() == Opcode;
@@ -382,6 +385,17 @@ inline VPScalarIVSteps_match<Op0_t, Op1_t> m_ScalarIVSteps(const Op0_t &Op0,
const Op1_t &Op1) {
return VPScalarIVSteps_match<Op0_t, Op1_t>(Op0, Op1);
}
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+using VPDerivedIV_match =
+ Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0, false, VPDerivedIVRecipe>;
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPDerivedIV_match<Op0_t, Op1_t, Op2_t>
+m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
+}
+
} // namespace VPlanPatternMatch
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ef5f6e22f822..77c08839dbfa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -57,6 +57,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case Instruction::Or:
case Instruction::ICmp:
case Instruction::Select:
+ case VPInstruction::AnyOf:
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -361,6 +362,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::AnyOf:
return true;
default:
return false;
@@ -565,6 +567,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
+ ReducedPartRdx =
+ createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
@@ -573,7 +578,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
// Create the reduction after the loop. Note that inloop reductions create
// the target reduction in the loop using a Reduction recipe.
if ((State.VF.isVector() ||
- RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
!PhiR->isInLoop()) {
ReducedPartRdx =
createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
@@ -615,8 +621,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Addend = State.get(getOperand(1), VPLane(0));
- return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
- : Builder.CreatePtrAdd(Ptr, Addend, Name);
+ return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::ResumePhi: {
Value *IncomingFromVPlanPred =
@@ -624,18 +629,22 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *IncomingFromOtherPreds =
State.get(getOperand(1), /* IsScalar */ true);
auto *NewPhi =
- Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
+ Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
BasicBlock *VPlanPred =
State.CFG
- .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
+ .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
- assert(OtherPred != VPlanPred &&
- "VPlan predecessors should not be connected yet");
+ if (OtherPred == VPlanPred)
+ continue;
NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
}
return NewPhi;
}
+ case VPInstruction::AnyOf: {
+ Value *A = State.get(getOperand(0));
+ return Builder.CreateOrReduce(A);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -644,7 +653,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
@@ -707,6 +717,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return false;
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::Or:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
@@ -802,6 +813,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::AnyOf:
+ O << "any-of";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -819,12 +833,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
void VPIRInstruction::execute(VPTransformState &State) {
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
"Only PHINodes can have extra operands");
- if (getNumOperands() == 1) {
- VPValue *ExitValue = getOperand(0);
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ VPValue *ExitValue = Op;
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
- auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
@@ -857,11 +872,13 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- assert(getNumOperands() == 1 && "can have at most 1 operand");
- O << " (extra operand: ";
- getOperand(0)->printAsOperand(O, SlotTracker);
- O << " from ";
- getParent()->getPredecessors()[0]->printAsOperand(O);
+ O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
+ interleaveComma(
+ enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
+ Op.value()->printAsOperand(O, SlotTracker);
+ O << " from ";
+ getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
+ });
O << ")";
}
}
@@ -950,7 +967,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
+ if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
+ State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
@@ -964,7 +982,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
Module *M = State.Builder.GetInsertBlock()->getModule();
Function *VectorF =
Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
+ assert(VectorF &&
+ "Can't retrieve vector intrinsic or vector-predication intrinsics.");
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
@@ -1012,11 +1031,11 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
SmallVector<Type *> ParamTys;
for (unsigned I = 0; I != getNumOperands(); ++I)
ParamTys.push_back(
- ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
@@ -1184,7 +1203,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
VPValue *Op0, *Op1;
@@ -1254,8 +1273,12 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
getFastMathFlags().print(O);
break;
case OperationType::GEPOp:
- if (GEPFlags.IsInBounds)
+ if (GEPFlags.isInBounds())
O << " inbounds";
+ else if (GEPFlags.hasNoUnsignedSignedWrap())
+ O << " nusw";
+ if (GEPFlags.hasNoUnsignedWrap())
+ O << " nuw";
break;
case OperationType::NonNegOp:
if (NonNegFlags.NonNeg)
@@ -1361,7 +1384,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
switch (Opcode) {
case Instruction::FNeg: {
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(
Opcode, VectorTy, CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
@@ -1399,7 +1422,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
getOperand(1)->isDefinedOutsideLoopRegions())
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
SmallVector<const Value *, 4> Operands;
@@ -1412,13 +1435,13 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
}
case Instruction::Freeze: {
// This opcode is unknown. Assume that it is the same as 'mul'.
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
@@ -1546,8 +1569,8 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
}
auto *SrcTy =
- cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF));
- auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
+ auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
// Arm TTI will use the underlying instruction to determine the cost.
return Ctx.TTI.getCastInstrCost(
Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
@@ -1559,7 +1582,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
- O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+ O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
printOperands(O, SlotTracker);
O << " to " << *getResultType();
@@ -1572,10 +1595,10 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
}
/// This function adds
-/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
-/// to each vector element of Val. The sequence starts at StartIndex.
+/// (0 * Step, 1 * Step, 2 * Step, ...)
+/// to each vector element of Val.
/// \p Opcode is relevant for FP induction variable.
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+static Value *getStepVector(Value *Val, Value *Step,
Instruction::BinaryOps BinOp, ElementCount VF,
IRBuilderBase &Builder) {
assert(VF.isVector() && "only vector VFs are supported");
@@ -1600,11 +1623,7 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
}
Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw
@@ -1617,7 +1636,6 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
Value *MulOp = Builder.CreateFMul(InitVec, Step);
@@ -1638,12 +1656,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
const InductionDescriptor &ID = getInductionDescriptor();
TruncInst *Trunc = getTruncInst();
IRBuilderBase &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
+ "Types must match");
assert(State.VF.isVector() && "must have vector VF");
// The value from the original loop to which we are mapping the new induction
// variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
@@ -1668,10 +1687,9 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+ Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
+ State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -1711,14 +1729,14 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ VecInd->setDebugLoc(getDebugLoc());
State.set(this, VecInd);
Instruction *LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
if (isa<TruncInst>(EntryVal))
State.addMetadata(LastInduction, EntryVal);
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ LastInduction->setDebugLoc(getDebugLoc());
VecInd->addIncoming(SteppedStart, VectorPH);
// Add induction update using an incorrect block temporarily. The phi node
@@ -1732,20 +1750,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-INDUCTION";
- if (getTruncInst()) {
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" ";
- getVPValue(0)->printAsOperand(O, SlotTracker);
- } else
- O << " " << VPlanIngredient(IV);
-
- O << ", ";
- getStepValue()->printAsOperand(O, SlotTracker);
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = WIDEN-INDUCTION ";
+ printOperands(O, SlotTracker);
- O << ", ";
- getVFValue()->printAsOperand(O, SlotTracker);
+ if (auto *TI = getTruncInst())
+ O << " (truncated to " << *TI->getType() << ")";
}
#endif
@@ -1896,9 +1907,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
Ops.push_back(State.get(getOperand(I), VPLane(0)));
- auto *NewGEP =
- State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
- ArrayRef(Ops).drop_front(), "", isInBounds());
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
+ ArrayRef(Ops).drop_front(), "",
+ getGEPNoWrapFlags());
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
State.set(this, Splat);
State.addMetadata(Splat, GEP);
@@ -1924,7 +1935,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
- Indices, "", isInBounds());
+ Indices, "", getGEPNoWrapFlags());
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP);
@@ -1975,9 +1986,10 @@ void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
// LastLane = 1 - RunTimeVF
Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- bool InBounds = isInBounds();
- Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
- ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
+ Value *ResultPtr =
+ Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
+ ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
+ getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -1987,9 +1999,8 @@ void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
- O << " = reverse-vector-pointer ";
- if (isInBounds())
- O << "inbounds ";
+ O << " = reverse-vector-pointer";
+ printFlags(O);
printOperands(O, SlotTracker);
}
#endif
@@ -2001,10 +2012,10 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- bool InBounds = isInBounds();
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
- Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
+ Value *ResultPtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -2066,8 +2077,8 @@ InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
if (vputils::onlyFirstLaneUsed(this))
return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
- Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
- Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
+ Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
return (getNumIncomingValues() - 1) *
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -2104,6 +2115,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ State.setDebugLocFrom(getDebugLoc());
Value *NewVecOp = State.get(getVecOp());
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, State.VF.isScalar());
@@ -2188,7 +2200,7 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
RecurKind RdxKind = RdxDesc.getRecurrenceKind();
Type *ElementTy = Ctx.Types.inferScalarType(this);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned Opcode = RdxDesc.getOpcode();
@@ -2380,6 +2392,7 @@ InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
assert(State.Lane && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
cast<Instruction>(State.get(getOperand(0), *State.Lane));
@@ -2439,7 +2452,7 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -2586,7 +2599,7 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
// legacy model, it will always calculate the cost of mask.
// TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
// don't need to compare to the legacy cost model.
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -2707,7 +2720,7 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
// legacy model, it will always calculate the cost of mask.
// TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
// don't need to compare to the legacy cost model.
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -3075,7 +3088,7 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
Type *ValTy = Ctx.Types.inferScalarType(
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
: getStoredValues()[InsertPosIdx]);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -3111,31 +3124,14 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool VPCanonicalIVPHIRecipe::isCanonical(
- InductionDescriptor::InductionKind Kind, VPValue *Start,
- VPValue *Step) const {
- // Must be an integer induction.
- if (Kind != InductionDescriptor::IK_IntInduction)
- return false;
- // Start must match the start value of this canonical induction.
- if (Start != getStartValue())
- return false;
-
- // If the step is defined by a recipe, it is not a ConstantInt.
- if (Step->getDefiningRecipe())
- return false;
-
- ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
- return StepC && StepC->isOne();
-}
-
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
return IsScalarAfterVectorization &&
(!IsScalable || vputils::onlyFirstLaneUsed(this));
}
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
- assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
+ assert(getInductionDescriptor().getKind() ==
+ InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
@@ -3160,6 +3156,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
CanonicalIV->getIterator());
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+ NewPointerPhi->setDebugLoc(getDebugLoc());
} else {
// The recipe has been unrolled. In that case, fetch the single pointer phi
// shared among all unrolled parts of the recipe.
@@ -3170,8 +3167,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
// A pointer induction, performed by using a gep
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
- Value *ScalarStepValue = State.get(getOperand(1), VPLane(0));
- Type *PhiType = IndDesc.getStep()->getType();
+ Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
+ Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
// Add induction update using an incorrect block temporarily. The phi node
// will be fixed after VPlan execution. Note that at this point the latch
@@ -3223,7 +3220,8 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = WIDEN-POINTER-INDUCTION ";
getStartValue()->printAsOperand(O, SlotTracker);
- O << ", " << *IndDesc.getStep();
+ O << ", ";
+ getStepValue()->printAsOperand(O, SlotTracker);
if (getNumOperands() == 4) {
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
@@ -3235,13 +3233,22 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "cannot be used in per-lane");
+ if (State.ExpandedSCEVs.contains(Expr)) {
+ // SCEV Expr has already been expanded, result must already be set. At the
+ // moment we have to execute the entry block twice (once before skeleton
+ // creation to get expanded SCEVs used by the skeleton and once during
+ // regular VPlan execution).
+ State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
+ assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
+ "Results must match");
+ return;
+ }
+
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
SCEVExpander Exp(SE, DL, "induction");
Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
&*State.Builder.GetInsertPoint());
- assert(!State.ExpandedSCEVs.contains(Expr) &&
- "Same SCEV expanded multiple times");
State.ExpandedSCEVs[Expr] = Res;
State.set(this, Res, VPLane(0));
}
@@ -3324,7 +3331,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
Type *VectorTy =
- ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+ toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
cast<VectorType>(VectorTy), Mask, CostKind,
@@ -3358,7 +3365,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
: VectorType::get(StartV->getType(), State.VF);
BasicBlock *HeaderBB = State.CFG.PrevBB;
- assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
+ assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
"recipe must be in the vector loop header");
auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
@@ -3380,6 +3387,22 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Builder.SetInsertPoint(VectorPH->getTerminator());
StartV = Iden = State.get(StartVPV);
}
+ } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ // [I|F]FindLastIV will use a sentinel value to initialize the reduction
+ // phi or the resume value from the main vector loop when vectorizing the
+ // epilogue loop. In the exit block, ComputeReductionResult will generate
+ // checks to verify if the reduction result is the sentinel value. If the
+ // result is the sentinel value, it will be corrected back to the start
+ // value.
+ // TODO: The sentinel value is not always necessary. When the start value is
+ // a constant, and smaller than the start value of the induction variable,
+ // the start value can be directly used to initialize the reduction phi.
+ Iden = StartV;
+ if (!ScalarPHI) {
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ }
} else {
Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
RdxDesc.getFastMathFlags());
@@ -3483,7 +3506,7 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
void VPScalarPHIRecipe::execute(VPTransformState &State) {
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
- Value *Start = State.get(getOperand(0), VPLane(0));
+ Value *Start = State.get(getStartValue(), VPLane(0));
PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
Phi->addIncoming(Start, VectorPH);
Phi->setDebugLoc(getDebugLoc());
@@ -3493,7 +3516,7 @@ void VPScalarPHIRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << Indent << "SCALAR-PHI";
+ O << Indent << "SCALAR-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cee83d1015b5..8ac2bd5160c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -61,8 +61,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step,
- &Plan->getVF(), *II);
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(
+ Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
} else {
assert(isa<VPInstruction>(&Ingredient) &&
"only VPInstructions expected here");
@@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
- SetVector<VPRegionBlock *> DeletedRegions;
+ SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
// Collect replicate regions followed by an empty block, followed by another
// replicate region with matching masks to process front. This is to avoid
@@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
// Move recipes from Region1 to its successor region, if both are triangles.
for (VPRegionBlock *Region1 : WorkList) {
- if (DeletedRegions.contains(Region1))
+ if (TransformedRegions.contains(Region1))
continue;
auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
@@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
}
VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
- DeletedRegions.insert(Region1);
+ TransformedRegions.insert(Region1);
}
- for (VPRegionBlock *ToDelete : DeletedRegions)
- delete ToDelete;
- return !DeletedRegions.empty();
+ return !TransformedRegions.empty();
}
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
@@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
assert(Instr->getParent() && "Predicated instruction not in any basic block");
auto *BlockInMask = PredRecipe->getMask();
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *Entry =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
// Replace predicated replicate recipe with a replicate recipe without a
// mask but in the replicate region.
@@ -318,17 +317,21 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
PredRecipe->getUnderlyingInstr(),
make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
PredRecipe->isUniform());
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+ auto *Pred =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
VPPredInstPHIRecipe *PHIRecipe = nullptr;
if (PredRecipe->getNumUsers() != 0) {
- PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
+ PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
+ RecipeWithoutMask->getDebugLoc());
PredRecipe->replaceAllUsesWith(PHIRecipe);
PHIRecipe->setOperand(0, RecipeWithoutMask);
}
PredRecipe->eraseFromParent();
- auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+ auto *Exiting =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ VPRegionBlock *Region =
+ Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -377,7 +380,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
continue;
auto *PredVPBB =
dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
- if (!PredVPBB || PredVPBB->getNumSuccessors() != 1)
+ if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
+ isa<VPIRBasicBlock>(PredVPBB))
continue;
WorkList.push_back(VPBB);
}
@@ -394,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
VPBlockUtils::disconnectBlocks(VPBB, Succ);
VPBlockUtils::connectBlocks(PredVPBB, Succ);
}
- delete VPBB;
+ // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
}
@@ -526,11 +530,8 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
VPValue *StartV, VPValue *Step, VPBuilder &Builder) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
- VPSingleDefRecipe *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
- BaseIV = Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step,
- "offset.idx");
- }
+ VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
+ Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
// Truncate base induction if needed.
Type *CanonicalIVType = CanonicalIV->getScalarType();
@@ -661,6 +662,151 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+ using namespace llvm::VPlanPatternMatch;
+
+ if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+ // Try to remove redundant blend recipes.
+ SmallPtrSet<VPValue *, 4> UniqueValues;
+ if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(0));
+ for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+ if (!match(Blend->getMask(I), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(I));
+
+ if (UniqueValues.size() == 1) {
+ Blend->replaceAllUsesWith(*UniqueValues.begin());
+ Blend->eraseFromParent();
+ return;
+ }
+
+ if (Blend->isNormalized())
+ return;
+
+ // Normalize the blend so its first incoming value is used as the initial
+ // value with the others blended into it.
+
+ unsigned StartIndex = 0;
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ // If a value's mask is used only by the blend then it can be deadcoded.
+ // TODO: Find the most expensive mask that can be deadcoded, or a mask
+ // that's used by multiple blends where it can be removed from them all.
+ VPValue *Mask = Blend->getMask(I);
+ if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+ StartIndex = I;
+ break;
+ }
+ }
+
+ SmallVector<VPValue *, 4> OperandsWithMask;
+ OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
+
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ if (I == StartIndex)
+ continue;
+ OperandsWithMask.push_back(Blend->getIncomingValue(I));
+ OperandsWithMask.push_back(Blend->getMask(I));
+ }
+
+ auto *NewBlend = new VPBlendRecipe(
+ cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+ NewBlend->insertBefore(&R);
+
+ VPValue *DeadMask = Blend->getMask(StartIndex);
+ Blend->replaceAllUsesWith(NewBlend);
+ Blend->eraseFromParent();
+ recursivelyDeleteDeadRecipes(DeadMask);
+ return;
+ }
+
+ VPValue *A;
+ if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
+ VPValue *Trunc = R.getVPSingleValue();
+ Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+ Type *ATy = TypeInfo.inferScalarType(A);
+ if (TruncTy == ATy) {
+ Trunc->replaceAllUsesWith(A);
+ } else {
+ // Don't replace a scalarizing recipe with a widened cast.
+ if (isa<VPReplicateRecipe>(&R))
+ return;
+ if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+ unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+ ? Instruction::SExt
+ : Instruction::ZExt;
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+ // UnderlyingExt has distinct return type, used to retain legacy cost.
+ VPC->setUnderlyingValue(UnderlyingExt);
+ }
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
+ }
+#ifndef NDEBUG
+ // Verify that the cached type info for both A and its users is still
+ // accurate by comparing it to freshly computed types.
+ VPTypeAnalysis TypeInfo2(
+ R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
+ assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+ for (VPUser *U : A->users()) {
+ auto *R = cast<VPRecipeBase>(U);
+ for (VPValue *VPV : R->definedValues())
+ assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+ }
+#endif
+ }
+
+ // Simplify (X && Y) || (X && !Y) -> X.
+ // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+ // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+ // recipes to be visited during simplification.
+ VPValue *X, *Y, *X1, *Y1;
+ if (match(&R,
+ m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+ X == X1 && Y == Y1) {
+ R.getVPSingleValue()->replaceAllUsesWith(X);
+ R.eraseFromParent();
+ return;
+ }
+
+ if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+ if (match(&R, m_Not(m_Not(m_VPValue(A)))))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+ // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
+ if ((match(&R,
+ m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
+ match(&R,
+ m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
+ TypeInfo.inferScalarType(R.getOperand(1)) ==
+ TypeInfo.inferScalarType(R.getVPSingleValue()))
+ return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
+}
+
+/// Try to simplify the recipes in \p Plan
+static void simplifyRecipes(VPlan &Plan) {
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ VPTypeAnalysis TypeInfo(CanonicalIVType);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ simplifyRecipe(R, TypeInfo);
+ }
+ }
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -696,11 +842,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPInstruction::BranchOnCond,
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
- SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
- for (VPValue *Op : PossiblyDead)
- recursivelyDeleteDeadRecipes(Op);
ExitingVPBB->appendRecipe(BOC);
+
+ VPlanTransforms::removeDeadRecipes(Plan);
+
Plan.setVF(BestVF);
Plan.setUF(BestUF);
// TODO: Further simplifications are possible
@@ -941,126 +1087,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
-/// Try to simplify recipe \p R.
-static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
- using namespace llvm::VPlanPatternMatch;
-
- if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
- // Try to remove redundant blend recipes.
- SmallPtrSet<VPValue *, 4> UniqueValues;
- if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
- UniqueValues.insert(Blend->getIncomingValue(0));
- for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
- if (!match(Blend->getMask(I), m_False()))
- UniqueValues.insert(Blend->getIncomingValue(I));
-
- if (UniqueValues.size() == 1) {
- Blend->replaceAllUsesWith(*UniqueValues.begin());
- Blend->eraseFromParent();
- return;
- }
-
- if (Blend->isNormalized())
- return;
-
- // Normalize the blend so its first incoming value is used as the initial
- // value with the others blended into it.
-
- unsigned StartIndex = 0;
- for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
- // If a value's mask is used only by the blend then is can be deadcoded.
- // TODO: Find the most expensive mask that can be deadcoded, or a mask
- // that's used by multiple blends where it can be removed from them all.
- VPValue *Mask = Blend->getMask(I);
- if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
- StartIndex = I;
- break;
- }
- }
-
- SmallVector<VPValue *, 4> OperandsWithMask;
- OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
- for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
- if (I == StartIndex)
- continue;
- OperandsWithMask.push_back(Blend->getIncomingValue(I));
- OperandsWithMask.push_back(Blend->getMask(I));
- }
-
- auto *NewBlend = new VPBlendRecipe(
- cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
- NewBlend->insertBefore(&R);
-
- VPValue *DeadMask = Blend->getMask(StartIndex);
- Blend->replaceAllUsesWith(NewBlend);
- Blend->eraseFromParent();
- recursivelyDeleteDeadRecipes(DeadMask);
- return;
- }
-
- VPValue *A;
- if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
- VPValue *Trunc = R.getVPSingleValue();
- Type *TruncTy = TypeInfo.inferScalarType(Trunc);
- Type *ATy = TypeInfo.inferScalarType(A);
- if (TruncTy == ATy) {
- Trunc->replaceAllUsesWith(A);
- } else {
- // Don't replace a scalarizing recipe with a widened cast.
- if (isa<VPReplicateRecipe>(&R))
- return;
- if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
-
- unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
- ? Instruction::SExt
- : Instruction::ZExt;
- auto *VPC =
- new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
- if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
- // UnderlyingExt has distinct return type, used to retain legacy cost.
- VPC->setUnderlyingValue(UnderlyingExt);
- }
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
- auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- }
- }
-#ifndef NDEBUG
- // Verify that the cached type info is for both A and its users is still
- // accurate by comparing it to freshly computed types.
- VPTypeAnalysis TypeInfo2(
- R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
- assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
- for (VPUser *U : A->users()) {
- auto *R = cast<VPRecipeBase>(U);
- for (VPValue *VPV : R->definedValues())
- assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
- }
-#endif
- }
-
- // Simplify (X && Y) || (X && !Y) -> X.
- // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
- // && (Y || Z) and (X || !X) into true. This requires queuing newly created
- // recipes to be visited during simplification.
- VPValue *X, *Y, *X1, *Y1;
- if (match(&R,
- m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
- m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
- X == X1 && Y == Y1) {
- R.getVPSingleValue()->replaceAllUsesWith(X);
- R.eraseFromParent();
- return;
- }
-
- if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
- return R.getVPSingleValue()->replaceAllUsesWith(A);
-}
-
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1095,19 +1121,6 @@ static void licm(VPlan &Plan) {
}
}
-/// Try to simplify the recipes in \p Plan.
-static void simplifyRecipes(VPlan &Plan) {
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getEntry());
- Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- simplifyRecipe(R, TypeInfo);
- }
- }
-}
-
void VPlanTransforms::truncateToMinimalBitwidths(
VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
#ifndef NDEBUG
@@ -1247,11 +1260,11 @@ void VPlanTransforms::optimize(VPlan &Plan) {
simplifyRecipes(Plan);
legalizeAndOptimizeInductions(Plan);
+ removeRedundantExpandSCEVRecipes(Plan);
+ simplifyRecipes(Plan);
removeDeadRecipes(Plan);
createAndOptimizeReplicateRegions(Plan);
-
- removeRedundantExpandSCEVRecipes(Plan);
mergeBlocksIntoPredecessors(Plan);
licm(Plan);
}
@@ -1438,112 +1451,134 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
+/// nullptr if no EVL-based recipe could be created. The new recipe is only created here, not inserted into the plan.
+/// \p HeaderMask Header mask; a recipe mask equal to it is replaced by nullptr in the EVL recipe.
+/// \p CurRecipe Recipe to be transformed.
+/// \p TypeInfo VPlan-based type analysis, used to infer the scalar type of new intrinsic recipes.
+/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
+/// \p EVL The explicit vector length parameter of vector-predication
+/// intrinsics.
+static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
+ VPRecipeBase &CurRecipe,
+ VPTypeAnalysis &TypeInfo,
+ VPValue &AllOneMask, VPValue &EVL) {
+ using namespace llvm::VPlanPatternMatch;
+ auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
+ assert(OrigMask && "Unmasked recipe when folding tail");
+ return HeaderMask == OrigMask ? nullptr : OrigMask; // Drop the mask when it is exactly the header mask.
+ };
+
+ return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+ })
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ VPValue *NewMask = GetNewMask(S->getMask());
+ return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+ })
+ .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
+ unsigned Opcode = W->getOpcode();
+ if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
+ return nullptr; // Only binary and unary opcodes are converted here.
+ return new VPWidenEVLRecipe(*W, EVL);
+ })
+ .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
+ VPValue *NewMask = GetNewMask(Red->getCondOp());
+ return new VPReductionEVLRecipe(*Red, EVL, NewMask);
+ })
+ .Case<VPWidenIntrinsicRecipe, VPWidenCastRecipe>(
+ [&](auto *CR) -> VPRecipeBase * {
+ Intrinsic::ID VPID;
+ if (auto *CallR = dyn_cast<VPWidenIntrinsicRecipe>(CR)) {
+ VPID =
+ VPIntrinsic::getForIntrinsic(CallR->getVectorIntrinsicID());
+ } else {
+ auto *CastR = cast<VPWidenCastRecipe>(CR);
+ VPID = VPIntrinsic::getForOpcode(CastR->getOpcode());
+ }
+ assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic");
+ assert(VPIntrinsic::getMaskParamPos(VPID) &&
+ VPIntrinsic::getVectorLengthParamPos(VPID) &&
+ "Expected VP intrinsic");
+
+ SmallVector<VPValue *> Ops(CR->operands()); // Original operands, then the all-true mask and EVL.
+ Ops.push_back(&AllOneMask);
+ Ops.push_back(&EVL);
+ return new VPWidenIntrinsicRecipe(
+ VPID, Ops, TypeInfo.inferScalarType(CR), CR->getDebugLoc());
+ })
+ .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+ SmallVector<VPValue *> Ops(Sel->operands()); // vp.select reuses the select operands and appends only EVL.
+ Ops.push_back(&EVL);
+ return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
+ TypeInfo.inferScalarType(Sel),
+ Sel->getDebugLoc());
+ })
+ .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
+ VPValue *LHS, *RHS;
+ // Transform select with a header mask condition
+ // select(header_mask, LHS, RHS)
+ // into vector predication merge.
+ // vp.merge(all-true, LHS, RHS, EVL)
+ if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
+ m_VPValue(RHS))))
+ return nullptr;
+ // Use all true as the condition because this transformation is
+ // limited to selects whose condition is a header mask.
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
+ TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
+ })
+ .Default([&](VPRecipeBase *R) { return nullptr; }); // Any other recipe kind has no EVL variant here.
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
- using namespace llvm::VPlanPatternMatch;
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
VPTypeAnalysis TypeInfo(CanonicalIVType);
LLVMContext &Ctx = CanonicalIVType->getContext();
- SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+ VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
for (VPUser *U : Plan.getVF().users()) {
if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U))
R->setOperand(1, &EVL);
}
+ SmallVector<VPRecipeBase *> ToErase;
+
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
- auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
- assert(OrigMask && "Unmasked recipe when folding tail");
- return HeaderMask == OrigMask ? nullptr : OrigMask;
- };
-
- VPRecipeBase *NewRecipe =
- TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe)
- .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
- VPValue *NewMask = GetNewMask(L->getMask());
- return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
- })
- .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
- VPValue *NewMask = GetNewMask(S->getMask());
- return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
- })
- .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
- unsigned Opcode = W->getOpcode();
- if (!Instruction::isBinaryOp(Opcode) &&
- !Instruction::isUnaryOp(Opcode))
- return nullptr;
- return new VPWidenEVLRecipe(*W, EVL);
- })
- .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
- VPValue *NewMask = GetNewMask(Red->getCondOp());
- return new VPReductionEVLRecipe(*Red, EVL, NewMask);
- })
- .Case<VPWidenIntrinsicRecipe>(
- [&](VPWidenIntrinsicRecipe *CInst) -> VPRecipeBase * {
- auto *CI = cast<CallInst>(CInst->getUnderlyingInstr());
- Intrinsic::ID VPID = VPIntrinsic::getForIntrinsic(
- CI->getCalledFunction()->getIntrinsicID());
- if (VPID == Intrinsic::not_intrinsic)
- return nullptr;
-
- SmallVector<VPValue *> Ops(CInst->operands());
- assert(VPIntrinsic::getMaskParamPos(VPID) &&
- VPIntrinsic::getVectorLengthParamPos(VPID) &&
- "Expected VP intrinsic");
- VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
- IntegerType::getInt1Ty(CI->getContext())));
- Ops.push_back(Mask);
- Ops.push_back(&EVL);
- return new VPWidenIntrinsicRecipe(
- *CI, VPID, Ops, TypeInfo.inferScalarType(CInst),
- CInst->getDebugLoc());
- })
- .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
- SmallVector<VPValue *> Ops(Sel->operands());
- Ops.push_back(&EVL);
- return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
- TypeInfo.inferScalarType(Sel),
- Sel->getDebugLoc());
- })
- .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
- VPValue *LHS, *RHS;
- // Transform select with a header mask condition
- // select(header_mask, LHS, RHS)
- // into vector predication merge.
- // vp.merge(all-true, LHS, RHS, EVL)
- if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
- m_VPValue(RHS))))
- return nullptr;
- // Use all true as the condition because this transformation is
- // limited to selects whose condition is a header mask.
- VPValue *AllTrue =
- Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- return new VPWidenIntrinsicRecipe(
- Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
- })
- .Default([&](VPRecipeBase *R) { return nullptr; });
-
- if (!NewRecipe)
+ VPRecipeBase *EVLRecipe =
+ createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
+ [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
assert(
NumDefVal <= 1 &&
"Only supports recipes with a single definition or without users.");
- NewRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
+ EVLRecipe->insertBefore(CurRecipe);
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
- CurRecipe->eraseFromParent();
+ // Defer erasing recipes till the end so that we don't invalidate the
+ // VPTypeAnalysis cache.
+ ToErase.push_back(CurRecipe);
}
- recursivelyDeleteDeadRecipes(HeaderMask);
+ }
+
+ for (VPRecipeBase *R : reverse(ToErase)) {
+ SmallVector<VPValue *> PossiblyDead(R->operands());
+ R->eraseFromParent();
+ for (VPValue *Op : PossiblyDead)
+ recursivelyDeleteDeadRecipes(Op);
}
}
@@ -1667,8 +1702,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// instruction. Widen memory instructions involved in address computation
// will lead to gather/scatter instructions, which don't need to be
// handled.
- if (isa<VPWidenMemoryRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
+ if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
+ VPHeaderPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1820,9 +1855,7 @@ void VPlanTransforms::createInterleaveGroups(
}
}
-void VPlanTransforms::prepareToExecute(VPlan &Plan) {
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getVectorLoopRegion());
+void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
@@ -1840,3 +1873,62 @@ void VPlanTransforms::prepareToExecute(VPlan &Plan) {
}
}
}
+
+void VPlanTransforms::handleUncountableEarlyExit(
+ VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, // NOTE(review): SE is not referenced anywhere in this body.
+ BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *LatchVPBB = cast<VPBasicBlock>(LoopRegion->getExiting());
+ VPBuilder Builder(LatchVPBB->getTerminator()); // Builder is constructed from the latch terminator, which is erased at the end.
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPValue *IsEarlyExitTaken = nullptr;
+
+ // Process the uncountable exiting block. Update IsEarlyExitTaken, which
+ // tracks if the uncountable early exit has been taken. Also split the middle
+ // block and have it conditionally branch to the early exit block if
+ // IsEarlyExitTaken is true.
+ auto *EarlyExitingBranch =
+ cast<BranchInst>(UncountableExitingBlock->getTerminator());
+ BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
+ BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
+
+ // The early exit block may or may not be the same as the "countable" exit
+ // block. Create a new VPIRBasicBlock for the early exit block in case it is distinct
+ // from the countable exit block.
+ // TODO: Introduce both exit blocks during VPlan skeleton construction.
+ VPIRBasicBlock *VPEarlyExitBlock;
+ if (OrigLoop->getUniqueExitBlock()) {
+ VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ } else {
+ VPEarlyExitBlock = Plan.createVPIRBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); // The successor that is outside the loop.
+ }
+
+ VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
+ OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); // Block-in mask of the successor that stays in the loop.
+ auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
+ IsEarlyExitTaken =
+ Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
+
+ VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
+ VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
+ VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
+ NewMiddle->swapSuccessors(); // NOTE(review): presumably makes the early exit block the first successor - confirm.
+
+ VPBuilder MiddleBuilder(NewMiddle);
+ MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
+
+ // Replace the condition controlling the non-early exit from the vector loop
+ // with one exiting if either the original condition of the vector latch is
+ // true or the early exit has been taken.
+ auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
+ assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
+ "Unexpected terminator");
+ auto *IsLatchExitTaken =
+ Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+ LatchExitingBranch->getOperand(1));
+ auto *AnyExitTaken = Builder.createNaryOp(
+ Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+ LatchExitingBranch->eraseFromParent(); // The old BranchOnCount is replaced by the BranchOnCond created above.
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 1491e0a8df04..fddde8689116 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,8 +124,19 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+ /// Update \p Plan to account for the uncountable early exit block in \p
+ /// UncountableExitingBlock by
+ /// * updating the condition exiting the vector loop to include the early
+ /// exit conditions
+ /// * splitting the original middle block to branch to the early exit block
+ /// if taken.
+ static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ BasicBlock *UncountableExitingBlock,
+ VPRecipeBuilder &RecipeBuilder);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd.
- static void prepareToExecute(VPlan &Plan);
+ static void convertToConcreteRecipes(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f653269713b3..89e372d6b46c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -168,7 +168,7 @@ void UnrollState::unrollWidenInductionByUF(
auto *ConstStep = ScalarStep->isLiveIn()
? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
: nullptr;
- if (!ConstStep || ConstStep->getZExtValue() != 1) {
+ if (!ConstStep || ConstStep->getValue() != 1) {
if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
ScalarStep =
Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
@@ -412,8 +412,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
UnrollState Unroller(Plan, UF, Ctx);
- Unroller.unrollBlock(Plan.getPreheader());
-
// Iterate over all blocks in the plan starting from Entry, and unroll
// recipes inside them. This includes the vector preheader and middle blocks,
// which may set up or post-process per-part values.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4621c28b0512..e40af3e2e3d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -34,7 +34,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
Expanded = Plan.getOrAddLiveIn(E->getValue());
else {
Expanded = new VPExpandSCEVRecipe(Expr, SE);
- Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
+ Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
}
Plan.addSCEVExpansion(Expr, Expanded);
return Expanded;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 71c7d547ac7d..be420a873bef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -185,7 +185,7 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
RecipeNumbering[&R] = Cnt++;
for (const VPRecipeBase &R : *VPBB) {
- if (isa<VPIRInstruction>(&R) ^ isa<VPIRBasicBlock>(VPBB)) {
+ if (isa<VPIRInstruction>(&R) && !isa<VPIRBasicBlock>(VPBB)) {
errs() << "VPIRInstructions ";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
R.dump();
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9caf8c0df9b..493ed95b1d22 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -67,9 +67,10 @@ class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
- const DataLayout *DL, bool TryEarlyFoldsOnly)
+ const DataLayout *DL, TTI::TargetCostKind CostKind,
+ bool TryEarlyFoldsOnly)
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
- TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
+ CostKind(CostKind), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -81,6 +82,7 @@ private:
AAResults &AA;
AssumptionCache &AC;
const DataLayout *DL;
+ TTI::TargetCostKind CostKind;
/// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
@@ -113,6 +115,7 @@ private:
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
+ bool foldConcatOfBoolMasks(Instruction &I);
bool foldPermuteOfBinops(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
@@ -125,6 +128,8 @@ private:
bool shrinkType(Instruction &I);
void replaceValue(Value &Old, Value &New) {
+ LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
+ LLVM_DEBUG(dbgs() << " With: " << New << '\n');
Old.replaceAllUsesWith(&New);
if (auto *NewI = dyn_cast<Instruction>(&New)) {
New.takeName(&Old);
@@ -135,10 +140,18 @@ private:
}
void eraseInstruction(Instruction &I) {
- for (Value *Op : I.operands())
- Worklist.pushValue(Op);
+ LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
+ SmallVector<Value *> Ops(I.operands());
Worklist.remove(&I);
I.eraseFromParent();
+
+ // Push remaining users of the operands and then the operand itself - allows
+ // further folds that were hindered by OneUse limits.
+ for (Value *Op : Ops)
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ Worklist.pushUsersToWorkList(*OpI);
+ Worklist.pushValue(OpI);
+ }
}
};
} // namespace
@@ -176,8 +189,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
Value *Scalar;
- if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
- !Scalar->hasOneUse())
+ if (!match(&I,
+ m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt())))
return false;
// Optionally match an extract from another vector.
@@ -247,16 +260,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
InstructionCost OldCost =
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OldCost +=
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
// Optionally, we are shuffling the loaded vector element(s) into place.
// For the mask set everything but element 0 to undef to prevent poison from
// propagating from the extra loaded memory. This will also optionally
@@ -270,7 +282,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
Mask[0] = OffsetEltIndex;
if (OffsetEltIndex)
- NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
+ NewCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
@@ -329,11 +342,11 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) {
// undef value is 0. We could add that cost if the cost model accurately
// reflects the real cost of that operation.
InstructionCost OldCost =
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
// New pattern: load PtrOp
InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
@@ -366,7 +379,6 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
return nullptr;
Type *VecTy = Ext0->getVectorOperand()->getType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
@@ -420,23 +432,22 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// Get cost estimates for scalar and vector versions of the operation.
bool IsBinOp = Instruction::isBinaryOp(Opcode);
if (IsBinOp) {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
} else {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Expected a compare");
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
- Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
- Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
}
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
unsigned Ext0Index = Ext0IndexC->getZExtValue();
unsigned Ext1Index = Ext1IndexC->getZExtValue();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
@@ -596,7 +607,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
return false;
Instruction *I0, *I1;
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
!match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
return false;
@@ -665,9 +676,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
return false;
- // TODO: We could handle this with a length-changing shuffle.
auto *VecTy = cast<FixedVectorType>(I.getType());
- if (SrcVec->getType() != VecTy)
+ auto *ScalarTy = VecTy->getScalarType();
+ auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
+ if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
return false;
// Ignore bogus insert/extract index.
@@ -681,11 +693,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
SmallVector<int> Mask(NumElts);
std::iota(Mask.begin(), Mask.end(), 0);
Mask[Index] = Index + NumElts;
-
- Type *ScalarTy = VecTy->getScalarType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+ TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
// If the extract has one use, it will be eliminated, so count it in the
@@ -695,17 +704,36 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+ TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
+
+ bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
+ // If the lengths of the two vectors are not equal,
+ // we need to add a length-change vector. Add this cost.
+ SmallVector<int> SrcMask;
+ if (NeedLenChg) {
+ SrcMask.assign(NumElts, PoisonMaskElem);
+ SrcMask[Index] = Index;
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ SrcVecTy, SrcMask, CostKind);
+ }
if (NewCost > OldCost)
return false;
- // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
- // shuffle DestVec, (fneg SrcVec), Mask
+ Value *NewShuf;
+ // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
- Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
- replaceValue(I, *Shuf);
+ if (NeedLenChg) {
+ // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
+ Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
+ NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
+ } else {
+ // shuffle DestVec, (fneg SrcVec), Mask
+ NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ }
+
+ replaceValue(I, *NewShuf);
return true;
}
@@ -772,22 +800,25 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
unsigned NumOps = IsUnary ? 1 : 2;
// The new shuffle must not cost more than the old shuffle.
- TargetTransformInfo::TargetCostKind CK =
- TargetTransformInfo::TCK_RecipThroughput;
TargetTransformInfo::ShuffleKind SK =
IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
: TargetTransformInfo::SK_PermuteTwoSrc;
- InstructionCost DestCost =
- TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
+ InstructionCost NewCost =
+ TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) +
(NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
TargetTransformInfo::CastContextHint::None,
- CK));
- InstructionCost SrcCost =
- TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
+ CostKind));
+ InstructionCost OldCost =
+ TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) +
TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
- TargetTransformInfo::CastContextHint::None, CK);
- if (DestCost > SrcCost || !DestCost.isValid())
+ TargetTransformInfo::CastContextHint::None,
+ CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
+ << OldCost << " vs NewCost: " << NewCost << "\n");
+
+ if (NewCost > OldCost || !NewCost.isValid())
return false;
// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
@@ -841,13 +872,13 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
// Calculate cost of splatting both operands into vectors and the vector
// intrinsic
VectorType *VecTy = cast<VectorType>(VPI.getType());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
Mask.resize(FVTy->getNumElements(), 0);
InstructionCost SplatCost =
TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask,
+ CostKind);
// Calculate the cost of the VP Intrinsic
SmallVector<Type *, 4> Args;
@@ -873,8 +904,8 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
} else {
- ScalarOpCost =
- TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
+ ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
+ VecTy->getScalarType(), CostKind);
}
// The existing splats may be kept around if other instructions use them.
@@ -924,7 +955,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
!match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
@@ -993,17 +1024,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (IsCmp) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
- Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
- Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
} else {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
}
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost InsertCost = TTI.getVectorInstrCost(
Instruction::InsertElement, VecTy, CostKind, Index);
InstructionCost OldCost =
@@ -1065,9 +1095,11 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
Instruction *I0, *I1;
Constant *C0, *C1;
- CmpInst::Predicate P0, P1;
+ CmpPredicate P0, P1;
+ // FIXME: Use CmpPredicate::getMatching here.
if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
- !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1)
+ !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) ||
+ P0 != static_cast<CmpInst::Predicate>(P1))
return false;
// The compare operands must be extracts of the same vector with constant
@@ -1080,7 +1112,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
auto *Ext0 = cast<ExtractElementInst>(I0);
auto *Ext1 = cast<ExtractElementInst>(I1);
- ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
if (!ConvertToShuf)
return false;
assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
@@ -1089,23 +1121,23 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
// The original scalar pattern is:
// binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
CmpInst::Predicate Pred = P0;
- unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
- : Instruction::ICmp;
+ unsigned CmpOpcode =
+ CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
if (!VecTy)
return false;
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Ext0Cost =
- TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0),
- Ext1Cost =
- TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+ InstructionCost Ext1Cost =
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
+ InstructionCost CmpCost = TTI.getCmpSelInstrCost(
+ CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
+ CostKind);
+
InstructionCost OldCost =
- Ext0Cost + Ext1Cost +
- TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
- CmpInst::makeCmpResultType(I0->getType()), Pred) *
- 2 +
- TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+ Ext0Cost + Ext1Cost + CmpCost * 2 +
+ TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
// The proposed vector pattern is:
// vcmp = cmp Pred X, VecC
@@ -1114,12 +1146,13 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
InstructionCost NewCost = TTI.getCmpSelInstrCost(
- CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
+ CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred,
+ CostKind);
SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
ShufMask[CheapIndex] = ExpensiveIndex;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
- ShufMask);
- NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ ShufMask, CostKind);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
@@ -1311,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
MemoryLocation::get(SI), AA))
return false;
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // be erased in the correct order.
+ Worklist.push(Load);
+
if (ScalarizableIdx.isSafeWithFreeze())
ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
Value *GEP = Builder.CreateInBoundsGEP(
@@ -1336,14 +1373,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
- auto *VecTy = cast<VectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
+ auto *VecTy = cast<VectorType>(LI->getType());
if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
InstructionCost OriginalCost =
TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
- LI->getPointerAddressSpace());
+ LI->getPointerAddressSpace(), CostKind);
InstructionCost ScalarizedCost = 0;
Instruction *LastCheckedInst = LI;
@@ -1377,7 +1414,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
LastCheckedInst = UI;
}
- auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
+ auto ScalarIdx =
+ canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
if (ScalarIdx.isUnsafe())
return false;
if (ScalarIdx.isSafeWithFreeze()) {
@@ -1385,24 +1423,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
ScalarIdx.discard();
}
- auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
OriginalCost +=
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
- Align(1), LI->getPointerAddressSpace());
+ Align(1), LI->getPointerAddressSpace(), CostKind);
ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
}
if (ScalarizedCost >= OriginalCost)
return false;
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // be erased in the correct order.
+ Worklist.push(LI);
+
// Replace extracts with narrow scalar loads.
for (User *U : LI->users()) {
auto *EI = cast<ExtractElementInst>(U);
- Value *Idx = EI->getOperand(1);
+ Value *Idx = EI->getIndexOperand();
// Insert 'freeze' for poison indexes.
auto It = NeedFreeze.find(EI);
@@ -1426,6 +1467,117 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
+/// to "(bitcast (concat X, Y))"
+/// where X/Y are bitcasted from i1 mask vectors. Returns true if \p I was replaced.
+bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
+ Type *Ty = I.getType();
+ if (!Ty->isIntegerTy())
+ return false;
+
+ // TODO: Add big endian test coverage (big-endian layouts are rejected until then).
+ if (DL->isBigEndian())
+ return false;
+
+ // Restrict to disjoint cases so the mask vectors aren't overlapping.
+ Instruction *X, *Y;
+ if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
+ return false;
+
+ // Allow both sources to contain shl, to handle more generic pattern:
+ // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
+ Value *SrcX;
+ uint64_t ShAmtX = 0; // Stays 0 when X matches the plain zext form (no shl).
+ if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
+ !match(X, m_OneUse(
+ m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
+ m_ConstantInt(ShAmtX)))))
+ return false;
+
+ Value *SrcY;
+ uint64_t ShAmtY = 0; // Stays 0 when Y matches the plain zext form (no shl).
+ if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
+ !match(Y, m_OneUse(
+ m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
+ m_ConstantInt(ShAmtY)))))
+ return false;
+
+ // Canonicalize larger shift to the RHS, so that ShAmtY >= ShAmtX from here on.
+ if (ShAmtX > ShAmtY) {
+ std::swap(X, Y);
+ std::swap(SrcX, SrcY);
+ std::swap(ShAmtX, ShAmtY);
+ }
+
+ // Ensure both sources are matching vXi1 bool mask types, and that the shift
+ // difference is the mask width so they can be easily concatenated together.
+ uint64_t ShAmtDiff = ShAmtY - ShAmtX;
+ unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0); // Non-zero shifts; scales the shl term of OldCost.
+ unsigned BitWidth = Ty->getPrimitiveSizeInBits();
+ auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
+ if (!MaskTy || SrcX->getType() != SrcY->getType() ||
+ !MaskTy->getElementType()->isIntegerTy(1) ||
+ MaskTy->getNumElements() != ShAmtDiff ||
+ MaskTy->getNumElements() > (BitWidth / 2)) // Both masks together must fit in BitWidth bits.
+ return false;
+
+ auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
+ auto *ConcatIntTy =
+ Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
+ auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
+
+ SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0); // 0..2N-1: all lanes of SrcX, then all lanes of SrcY.
+
+ // TODO: Is it worth supporting multi use cases? (Every matched operand above must be one-use.)
+ InstructionCost OldCost = 0; // Cost of the matched or/shl/zext/bitcast sequence.
+ OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
+ OldCost +=
+ NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
+ OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
+ TTI::CastContextHint::None, CostKind);
+
+ InstructionCost NewCost = 0; // Cost of shuffle + bitcast, plus zext/shl only when needed.
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy,
+ ConcatMask, CostKind);
+ NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
+ TTI::CastContextHint::None, CostKind);
+ if (Ty != ConcatIntTy)
+ NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
+ TTI::CastContextHint::None, CostKind);
+ if (ShAmtX > 0)
+ NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+
+ if (NewCost > OldCost) // Equal cost is accepted.
+ return false;
+
+ // Build bool mask concatenation, bitcast back to scalar integer, and perform
+ // any residual zero-extension or shifting (by the smaller shift amount, ShAmtX).
+ Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
+ Worklist.pushValue(Concat);
+
+ Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
+
+ if (Ty != ConcatIntTy) {
+ Worklist.pushValue(Result);
+ Result = Builder.CreateZExt(Result, Ty);
+ }
+
+ if (ShAmtX > 0) {
+ Worklist.pushValue(Result);
+ Result = Builder.CreateShl(Result, ShAmtX);
+ }
+
+ replaceValue(I, *Result);
+ return true;
+}
+
/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
/// --> "binop (shuffle), (shuffle)".
bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
@@ -1480,8 +1632,6 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
// Try to merge shuffles across the binop if the new shuffles are not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
@@ -1523,34 +1673,46 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
+/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
- BinaryOperator *B0, *B1;
ArrayRef<int> OldMask;
- if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
- m_Mask(OldMask))))
- return false;
-
- // Don't introduce poison into div/rem.
- if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
+ Instruction *LHS, *RHS;
+ if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
+ m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
return false;
// TODO: Add support for addlike etc.
- Instruction::BinaryOps Opcode = B0->getOpcode();
- if (Opcode != B1->getOpcode())
+ if (LHS->getOpcode() != RHS->getOpcode())
+ return false;
+
+ Value *X, *Y, *Z, *W;
+ bool IsCommutative = false;
+ CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
+ match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
+ auto *BO = cast<BinaryOperator>(LHS);
+ // Don't introduce poison into div/rem.
+ if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
+ return false;
+ IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
+ } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
+ match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
+ (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
+ IsCommutative = cast<CmpInst>(LHS)->isCommutative();
+ } else
return false;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
- if (!ShuffleDstTy || !BinOpTy)
+ auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
+ auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
return false;
unsigned NumSrcElts = BinOpTy->getNumElements();
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
- Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
- Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
- if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
- (X == W || Y == Z))
+ if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
std::swap(X, Y);
auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1575,33 +1737,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
}
// Try to replace a binop with a shuffle if the shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) +
- TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
- OldMask, CostKind, 0, nullptr, {B0, B1}, &I);
+ TTI.getInstructionCost(LHS, CostKind) +
+ TTI.getInstructionCost(RHS, CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
+ OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
InstructionCost NewCost =
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
- TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
- TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+
+ if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
+ NewCost +=
+ TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
+ } else {
+ auto *ShuffleCmpTy =
+ FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
+ NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
+ ShuffleDstTy, PredLHS, CostKind);
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
- if (NewCost >= OldCost)
+
+ // If either shuffle will constant fold away, then fold for the same cost as
+ // we will reduce the instruction count.
+ bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
+ (isa<Constant>(Y) && isa<Constant>(W));
+ if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
return false;
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
- Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
+ ? Builder.CreateBinOp(
+ cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
+ : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
// Intersect flags from the old binops.
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
- NewInst->copyIRFlags(B0);
- NewInst->andIRFlags(B1);
+ NewInst->copyIRFlags(LHS);
+ NewInst->andIRFlags(RHS);
}
Worklist.pushValue(Shuf0);
@@ -1672,8 +1849,6 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
// Try to replace a castop with a shuffle if the shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost CostC0 =
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
TTI::CastContextHint::None, CostKind);
@@ -1715,77 +1890,123 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
return true;
}
-/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
+/// Try to convert any of:
+/// "shuffle (shuffle x, y), (shuffle y, x)"
+/// "shuffle (shuffle x, undef), (shuffle y, undef)"
+/// "shuffle (shuffle x, undef), y"
+/// "shuffle x, (shuffle y, undef)"
/// into "shuffle x, y".
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
- Value *V0, *V1;
- UndefValue *U0, *U1;
- ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
+ ArrayRef<int> OuterMask;
+ Value *OuterV0, *OuterV1;
if (!match(&I,
- m_Shuffle(
- m_Shuffle(m_Value(V0), m_UndefValue(U0), m_Mask(InnerMask0)),
- m_Shuffle(m_Value(V1), m_UndefValue(U1), m_Mask(InnerMask1)),
- m_Mask(OuterMask))))
+ m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
+ return false;
+
+ ArrayRef<int> InnerMask0, InnerMask1;
+ Value *X0, *X1, *Y0, *Y1;
+ bool Match0 =
+ match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
+ bool Match1 =
+ match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
+ if (!Match0 && !Match1)
return false;
- auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
- auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
+ X0 = Match0 ? X0 : OuterV0;
+ Y0 = Match0 ? Y0 : OuterV0;
+ X1 = Match1 ? X1 : OuterV1;
+ Y1 = Match1 ? Y1 : OuterV1;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
- auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
+ auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
+ auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
- V0->getType() != V1->getType())
+ X0->getType() != X1->getType())
return false;
unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
unsigned NumImmElts = ShuffleImmTy->getNumElements();
- // Bail if either inner masks reference a RHS undef arg.
- if ((!isa<PoisonValue>(U0) &&
- any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) ||
- (!isa<PoisonValue>(U1) &&
- any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; })))
- return false;
-
- // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
+ // Attempt to merge shuffles, matching up to 2 source operands.
+ // Replace an index to a poison arg with PoisonMaskElem.
+ // Bail if either inner mask references an undef arg.
SmallVector<int, 16> NewMask(OuterMask);
+ Value *NewX = nullptr, *NewY = nullptr;
for (int &M : NewMask) {
+ Value *Src = nullptr;
if (0 <= M && M < (int)NumImmElts) {
- M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M];
+ Src = OuterV0;
+ if (Match0) {
+ M = InnerMask0[M];
+ Src = M >= (int)NumSrcElts ? Y0 : X0;
+ M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
+ }
} else if (M >= (int)NumImmElts) {
- if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts)
+ Src = OuterV1;
+ M -= NumImmElts;
+ if (Match1) {
+ M = InnerMask1[M];
+ Src = M >= (int)NumSrcElts ? Y1 : X1;
+ M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
+ }
+ }
+ if (Src && M != PoisonMaskElem) {
+ assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
+ if (isa<UndefValue>(Src)) {
+ // We've referenced an undef element - if it's poison, update the shuffle
+ // mask, else bail.
+ if (!isa<PoisonValue>(Src))
+ return false;
M = PoisonMaskElem;
- else
- M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
+ continue;
+ }
+ if (!NewX || NewX == Src) {
+ NewX = Src;
+ continue;
+ }
+ if (!NewY || NewY == Src) {
+ M += NumSrcElts;
+ NewY = Src;
+ continue;
+ }
+ return false;
}
}
+ // No lane of the merged mask selected a real source element: NewX is only
+ // left unset when every lane is PoisonMaskElem or reads a poison operand, so
+ // the whole result is poison. Replace the shuffle explicitly; returning the
+ // PoisonValue pointer from this bool function would merely convert to 'true'
+ // and leave I untouched while reporting a change.
+ if (!NewX) {
+   replaceValue(I, *PoisonValue::get(ShuffleDstTy));
+   return true;
+ }
+ if (!NewY)
+ NewY = PoisonValue::get(ShuffleSrcTy);
+
// Have we folded to an Identity shuffle?
if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
- replaceValue(I, *V0);
+ replaceValue(I, *NewX);
return true;
}
// Try to merge the shuffles if the new shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- InstructionCost InnerCost0 =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0);
- InstructionCost InnerCost1 =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1);
- InstructionCost OuterCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
- OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I);
+ InstructionCost InnerCost0 = 0;
+ if (Match0)
+ InnerCost0 = TTI.getInstructionCost(cast<Instruction>(OuterV0), CostKind);
+
+ InstructionCost InnerCost1 = 0;
+ if (Match1)
+ InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind);
+
+ InstructionCost OuterCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind,
+ 0, nullptr, {OuterV0, OuterV1}, &I);
+
InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy,
- NewMask, CostKind, 0, nullptr, {V0, V1});
- if (!ShufI0->hasOneUse())
+ bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
+ TargetTransformInfo::ShuffleKind SK =
+ IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
+ : TargetTransformInfo::SK_PermuteTwoSrc;
+ InstructionCost NewCost = TTI.getShuffleCost(
+ SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY});
+ if (!OuterV0->hasOneUse())
NewCost += InnerCost0;
- if (!ShufI1->hasOneUse())
+ if (!OuterV1->hasOneUse())
NewCost += InnerCost1;
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
@@ -1794,13 +2015,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
if (NewCost > OldCost)
return false;
- // Clear unused sources to poison.
- if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; }))
- V0 = PoisonValue::get(ShuffleSrcTy);
- if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; }))
- V1 = PoisonValue::get(ShuffleSrcTy);
-
- Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask);
+ Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
replaceValue(I, *Shuf);
return true;
}
@@ -1832,32 +2047,30 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
return false;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I) &&
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
II0->getArgOperand(I) != II1->getArgOperand(I))
return false;
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
- TTI::TCK_RecipThroughput) +
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
- TTI::TCK_RecipThroughput) +
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
- TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
+ CostKind, 0, nullptr, {II0, II1}, &I);
SmallVector<Type *> NewArgsTy;
InstructionCost NewCost = 0;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
NewArgsTy.push_back(II0->getArgOperand(I)->getType());
} else {
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
VecTy->getNumElements() * 2));
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- VecTy, OldMask, TTI::TCK_RecipThroughput);
+ VecTy, OldMask, CostKind);
}
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
- NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
+ NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1868,7 +2081,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
SmallVector<Value *> NewArgs;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
NewArgs.push_back(II0->getArgOperand(I));
} else {
Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
@@ -1923,7 +2136,7 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
}
/// Detect concat of multiple values into a vector
-static bool isFreeConcat(ArrayRef<InstLane> Item,
+static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
const TargetTransformInfo &TTI) {
auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
unsigned NumElts = Ty->getNumElements();
@@ -1934,8 +2147,7 @@ static bool isFreeConcat(ArrayRef<InstLane> Item,
// during legalization.
SmallVector<int, 16> ConcatMask(NumElts * 2);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
- if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
- TTI::TCK_RecipThroughput) != 0)
+ if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0)
return false;
unsigned NumSlices = Item.size() / NumElts;
@@ -1960,7 +2172,8 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
const SmallPtrSet<Use *, 4> &IdentityLeafs,
const SmallPtrSet<Use *, 4> &SplatLeafs,
const SmallPtrSet<Use *, 4> &ConcatLeafs,
- IRBuilder<> &Builder) {
+ IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI) {
auto [FrontU, FrontLane] = Item.front();
if (IdentityLeafs.contains(FrontU)) {
@@ -1995,13 +2208,14 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
SmallVector<Value *> Ops(NumOps);
for (unsigned Idx = 0; Idx < NumOps; Idx++) {
- if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) {
+ if (II &&
+ isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
Ops[Idx] = II->getOperand(Idx);
continue;
}
- Ops[Idx] =
- generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty,
- IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
+ Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
+ Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
+ Builder, TTI);
}
SmallVector<Value *, 8> ValueList;
@@ -2097,7 +2311,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
all_of(drop_begin(Item), [Item](InstLane &IL) {
Value *FrontV = Item.front().first->get();
Use *U = IL.first;
- return !U || U->get() == FrontV;
+ return !U || (isa<Constant>(U->get()) &&
+ cast<Constant>(U->get())->getSplatValue() ==
+ cast<Constant>(FrontV)->getSplatValue());
})) {
SplatLeafs.insert(FrontU);
continue;
@@ -2127,7 +2343,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
return false;
if (auto *CI = dyn_cast<CastInst>(V))
- if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy())
+ if (CI->getSrcTy()->getScalarType() !=
+ cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
return false;
if (auto *SI = dyn_cast<SelectInst>(V))
if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
@@ -2152,7 +2369,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
continue;
- } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) {
+ } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
+ FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
continue;
} else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
@@ -2173,7 +2391,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
II && isTriviallyVectorizable(II->getIntrinsicID()) &&
!II->hasOperandBundles()) {
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
- if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
+ &TTI)) {
if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
Value *FrontV = Item.front().first->get();
Use *U = IL.first;
@@ -2189,7 +2408,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
}
}
- if (isFreeConcat(Item, TTI)) {
+ if (isFreeConcat(Item, CostKind, TTI)) {
ConcatLeafs.insert(FrontU);
continue;
}
@@ -2200,11 +2419,13 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (NumVisited <= 1)
return false;
+ LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
+
// If we got this far, we know the shuffles are superfluous and can be
// removed. Scan through again and generate the new tree of instructions.
Builder.SetInsertPoint(&I);
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
- ConcatLeafs, Builder);
+ ConcatLeafs, Builder, &TTI);
replaceValue(I, *V);
return true;
}
@@ -2306,10 +2527,10 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
(UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
InstructionCost OldCost = TTI.getShuffleCost(
UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
- VecTyForCost, Shuffle->getShuffleMask());
+ VecTyForCost, Shuffle->getShuffleMask(), CostKind);
InstructionCost NewCost = TTI.getShuffleCost(
UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
- VecTyForCost, ConcatMask);
+ VecTyForCost, ConcatMask, CostKind);
LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
<< "\n");
@@ -2367,7 +2588,6 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
Type *ResultTy = I.getType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost = TTI.getArithmeticReductionCost(
ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
@@ -2624,17 +2844,17 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
? TTI::SK_PermuteSingleSrc
: TTI::SK_PermuteTwoSrc,
- VT, SV->getShuffleMask());
+ VT, SV->getShuffleMask(), CostKind);
};
auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
- return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+ return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind);
};
// Get the costs of the shuffles + binops before and after with the new
// shuffle masks.
InstructionCost CostBefore =
- TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
- TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
@@ -2647,8 +2867,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
FixedVectorType *Op1SmallVT =
FixedVectorType::get(VT->getScalarType(), V2.size());
InstructionCost CostAfter =
- TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
- TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
InstructionCost(0), AddShuffleMaskCost);
std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
@@ -2717,7 +2937,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
/// Cost model calculations take into account if zext(x) has other users and
/// whether it can be propagated through them too.
-bool VectorCombine::shrinkType(llvm::Instruction &I) {
+bool VectorCombine::shrinkType(Instruction &I) {
Value *ZExted, *OtherOperand;
if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
m_Value(OtherOperand))) &&
@@ -2746,7 +2966,6 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
// Calculate costs of leaving current IR as it is and moving ZExt operation
// later, along with adding truncates if needed
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ZExtCost = TTI.getCastInstrCost(
Instruction::ZExt, BigTy, SmallTy,
TargetTransformInfo::CastContextHint::None, CostKind);
@@ -2826,26 +3045,46 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
if (ExtIdx >= NumElts || InsIdx >= NumElts)
return false;
- SmallVector<int> Mask(NumElts, 0);
- std::iota(Mask.begin(), Mask.end(), 0);
- Mask[InsIdx] = ExtIdx + NumElts;
+ // Insertion into poison is a cheaper single operand shuffle.
+ TargetTransformInfo::ShuffleKind SK;
+ SmallVector<int> Mask(NumElts, PoisonMaskElem);
+ if (isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
+ SK = TargetTransformInfo::SK_PermuteSingleSrc;
+ Mask[InsIdx] = ExtIdx;
+ std::swap(DstVec, SrcVec);
+ } else {
+ SK = TargetTransformInfo::SK_PermuteTwoSrc;
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Mask[InsIdx] = ExtIdx + NumElts;
+ }
+
// Cost
auto *Ins = cast<InsertElementInst>(&I);
auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
-
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost OldCost =
- TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) +
+ InstructionCost InsCost =
TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
+ InstructionCost ExtCost =
+ TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+ InstructionCost OldCost = ExtCost + InsCost;
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask);
+ InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0,
+ nullptr, {DstVec, SrcVec});
if (!Ext->hasOneUse())
- NewCost += TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+ NewCost += ExtCost;
+
+ LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
if (OldCost < NewCost)
return false;
+ // Canonicalize undef param to RHS to help further folds.
+ if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ std::swap(DstVec, SrcVec);
+ }
+
Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
replaceValue(I, *Shuf);
@@ -2862,12 +3101,17 @@ bool VectorCombine::run() {
if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
return false;
+ LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
+
bool MadeChange = false;
auto FoldInst = [this, &MadeChange](Instruction &I) {
Builder.SetInsertPoint(&I);
+ bool IsVectorType = isa<VectorType>(I.getType());
bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
auto Opcode = I.getOpcode();
+ LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
+
// These folds should be beneficial regardless of when this pass is run
// in the optimization pipeline.
// The type checking is for run-time efficiency. We can avoid wasting time
@@ -2887,7 +3131,7 @@ bool VectorCombine::run() {
// This transform works with scalable and fixed vectors
// TODO: Identify and allow other scalable transforms
- if (isa<VectorType>(I.getType())) {
+ if (IsVectorType) {
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
MadeChange |= scalarizeVPIntrinsic(I);
@@ -2936,6 +3180,9 @@ bool VectorCombine::run() {
case Instruction::FCmp:
MadeChange |= foldExtractExtract(I);
break;
+ case Instruction::Or:
+ MadeChange |= foldConcatOfBoolMasks(I);
+ [[fallthrough]];
default:
if (Instruction::isBinaryOp(Opcode)) {
MadeChange |= foldExtractExtract(I);
@@ -2981,7 +3228,8 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
const DataLayout *DL = &F.getDataLayout();
- VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
+ TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;