summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp9
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp49
-rw-r--r--llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp47
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp205
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp16
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h68
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp16
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp56
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp147
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp43
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp71
14 files changed, 620 insertions, 127 deletions
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 3986359b6a5a..4df18c824927 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -583,10 +583,8 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
// RemoveDIs: there's no bitcode representation of the DbgVariableRecord
// debug-info, convert to dbg.values before writing out.
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
bool Changed = writeThinLTOBitcode(
OS, ThinLinkOS,
@@ -595,8 +593,5 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
},
M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
-
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index d0d349c891a3..ad1cd9c1f6bf 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -182,18 +182,11 @@ static cl::opt<bool> ClWithTls(
"platforms that support this"),
cl::Hidden, cl::init(true));
-static cl::opt<bool>
- CSelectiveInstrumentation("hwasan-selective-instrumentation",
- cl::desc("Use selective instrumentation"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClHotPercentileCutoff(
- "hwasan-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff."
- "By default `-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot",
+ cl::desc("Hot percentile cuttoff."));
static cl::opt<float>
- ClRandomSkipRate("hwasan-random-skip-rate", cl::init(0),
+ ClRandomSkipRate("hwasan-random-skip-rate",
cl::desc("Probability value in the range [0.0, 1.0] "
"to skip instrumentation of a function."));
@@ -317,7 +310,7 @@ private:
};
bool selectiveInstrumentationShouldSkip(Function &F,
- FunctionAnalysisManager &FAM);
+ FunctionAnalysisManager &FAM) const;
void initializeModule();
void createHwasanCtorComdat();
@@ -1500,28 +1493,22 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
}
bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
- Function &F, FunctionAnalysisManager &FAM) {
+ Function &F, FunctionAnalysisManager &FAM) const {
if (ClRandomSkipRate.getNumOccurrences()) {
std::bernoulli_distribution D(ClRandomSkipRate);
- if (D(*Rng))
- return true;
- } else {
- auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- if (PSI && PSI->hasProfileSummary()) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- if ((ClHotPercentileCutoff.getNumOccurrences() &&
- ClHotPercentileCutoff >= 0)
- ? PSI->isFunctionHotInCallGraphNthPercentile(
- ClHotPercentileCutoff, &F, BFI)
- : PSI->isFunctionHotInCallGraph(&F, BFI))
- return true;
- } else {
- ++NumNoProfileSummaryFuncs;
- }
+ return (D(*Rng));
}
- return false;
+ if (!ClHotPercentileCutoff.getNumOccurrences())
+ return false;
+ auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!PSI || !PSI->hasProfileSummary()) {
+ ++NumNoProfileSummaryFuncs;
+ return false;
+ }
+ return PSI->isFunctionHotInCallGraphNthPercentile(
+ ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F));
}
void HWAddressSanitizer::sanitizeFunction(Function &F,
@@ -1537,7 +1524,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
NumTotalFuncs++;
- if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM))
+ if (selectiveInstrumentationShouldSkip(F, FAM))
return;
NumInstrumentedFuncs++;
diff --git a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
index d87f7482a21d..6adc29f8572b 100644
--- a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
@@ -22,13 +23,11 @@ using namespace llvm;
#define DEBUG_TYPE "remove-traps"
-static cl::opt<int> HotPercentileCutoff(
- "remove-traps-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff. By default "
- "`-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> HotPercentileCutoff("remove-traps-percentile-cutoff-hot",
+ cl::desc("Hot percentile cuttoff."));
static cl::opt<float>
- RandomRate("remove-traps-random-rate", cl::init(0.0),
+ RandomRate("remove-traps-random-rate",
cl::desc("Probability value in the range [0.0, 1.0] of "
"unconditional pseudo-random checks removal."));
@@ -37,9 +36,11 @@ STATISTIC(NumChecksRemoved, "Number of removed checks");
static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
const ProfileSummaryInfo *PSI) {
- SmallVector<IntrinsicInst *, 16> Remove;
+ SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue;
std::unique_ptr<RandomNumberGenerator> Rng;
+ // TODO:
+ // https://github.com/llvm/llvm-project/pull/84858#discussion_r1520603139
auto ShouldRemove = [&](bool IsHot) {
if (!RandomRate.getNumOccurrences())
return IsHot;
@@ -56,26 +57,23 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
continue;
auto ID = II->getIntrinsicID();
switch (ID) {
- case Intrinsic::ubsantrap: {
+ case Intrinsic::allow_ubsan_check:
+ case Intrinsic::allow_runtime_check: {
++NumChecksTotal;
bool IsHot = false;
if (PSI) {
- uint64_t Count = 0;
- for (const auto *PR : predecessors(&BB))
- Count += BFI.getBlockProfileCount(PR).value_or(0);
-
- IsHot =
- HotPercentileCutoff.getNumOccurrences()
- ? (HotPercentileCutoff > 0 &&
- PSI->isHotCountNthPercentile(HotPercentileCutoff, Count))
- : PSI->isHotCount(Count);
+ uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0);
+ IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count);
}
- if (ShouldRemove(IsHot)) {
- Remove.push_back(II);
+ bool ToRemove = ShouldRemove(IsHot);
+ ReplaceWithValue.push_back({
+ II,
+ ToRemove,
+ });
+ if (ToRemove)
++NumChecksRemoved;
- }
break;
}
default:
@@ -84,10 +82,12 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
}
}
- for (IntrinsicInst *I : Remove)
+ for (auto [I, V] : ReplaceWithValue) {
+ I->replaceAllUsesWith(ConstantInt::getBool(I->getType(), !V));
I->eraseFromParent();
+ }
- return !Remove.empty();
+ return !ReplaceWithValue.empty();
}
PreservedAnalyses RemoveTrapsPass::run(Function &F,
@@ -102,3 +102,8 @@ PreservedAnalyses RemoveTrapsPass::run(Function &F,
return removeUbsanTraps(F, BFI, PSI) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
}
+
+bool RemoveTrapsPass::IsRequested() {
+ return RandomRate.getNumOccurrences() ||
+ HotPercentileCutoff.getNumOccurrences();
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0834865173b2..cb0fd06554e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -124,6 +124,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
- clEnumValN(
- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
- "data-and-control-without-rt-check",
- "Similar to data-and-control, but remove the runtime check")));
+ clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check"),
+ clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+ "Use predicated EVL instructions for tail folding. If EVL "
+ "is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1505,29 +1508,62 @@ public:
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
- : ChosenTailFoldingStyle.second;
+ if (!ChosenTailFoldingStyle)
+ return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
}
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- void setTailFoldingStyles() {
- assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
- ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
- "Tail folding must not be selected yet.");
- if (!Legal->prepareToFoldTailByMasking())
+ /// \param IsScalableVF true if scalable vector factors enabled.
+ /// \param UserIC User specific interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->prepareToFoldTailByMasking()) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
return;
+ }
- if (ForceTailFoldingStyle.getNumOccurrences()) {
- ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
- ForceTailFoldingStyle;
+ if (!ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
return;
}
- ChosenTailFoldingStyle.first =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
- ChosenTailFoldingStyle.second =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
+ // Set styles when forced.
+ ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue());
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override forced styles if needed.
+ // FIXME: use actual opcode/data type for analysis here.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal =
+ IsScalableVF && UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ // FIXME: implement support for max safe dependency distance.
+ Legal->isSafeForAnyVectorWidth() &&
+ // FIXME: remove this once reductions are supported.
+ Legal->getReductionVars().empty();
+ if (!EVLIsLegal) {
+ // If for some reason EVL mode is unsupported, fallback to
+ // DataWithoutLaneMask to try to vectorize the loop with folded tail
+ // in a generic way.
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+ }
}
/// Returns true if all loop blocks should be masked to fold tail loop.
@@ -1544,6 +1580,18 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+ // FIXME: remove this once vp_reverse is supported.
+ none_of(
+ WideningDecisions,
+ [](const std::pair<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>
+ &Data) { return Data.second.first == CM_Widen_Reverse; });
+ }
+
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
return InLoopReductions.contains(Phi);
@@ -1688,8 +1736,8 @@ private:
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
- std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles();
- if (foldTailByMasking())
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ if (foldTailByMasking()) {
+ if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+ "try to generate VP Intrinsics with scalable vector "
+ "factors only.\n");
+ // Tail folded loop using VP intrinsics restricts the VF to be scalable
+ // for now.
+ // TODO: extend it for fixed vectors, if required.
+ assert(MaxFactors.ScalableVF.isScalable() &&
+ "Expected scalable vector factor.");
+
+ MaxFactors.FixedVF = ElementCount::getFixed(1);
+ }
return MaxFactors;
+ }
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!isScalarEpilogueAllowed())
return 1;
+ // Do not interleave if EVL is preferred and no User IC is specified.
+ if (foldTailWithEVL()) {
+ LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+ "Unroll factor forced to be 1.\n");
+ return 1;
+ }
+
// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::truncateToMinimalBitwidths(
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
+ // TODO: try to put it close to addActiveLaneMask().
+ if (CM.foldTailWithEVL())
+ VPlanTransforms::addExplicitVectorLength(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
+/// Creates either vp_store or vp_scatter intrinsics calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+ Value *StoredVal, bool IsScatter, Value *Mask,
+ Value *EVL, const Align &Alignment) {
+ CallInst *Call;
+ if (IsScatter) {
+ Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+ Intrinsic::vp_scatter,
+ {StoredVal, Addr, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
+ Call->addParamAttr(
+ 1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsics calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+ VectorType *DataTy,
+ Value *Addr, bool IsGather,
+ Value *Mask, Value *EVL,
+ const Align &Alignment) {
+ CallInst *Call;
+ if (IsGather) {
+ Call =
+ Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+ nullptr, "wide.masked.gather");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
+ }
+ Call->addParamAttr(
+ 0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, then EVL must be a valid value set during plan
+ // creation, possibly default value = whole vector register length. EVL
+ // is created only if TTI prefers predicated vectorization, thus if EVL
+ // is not nullptr it also implies preference for predicated
+ // vectorization.
+ // FIXME: Support reverse store after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewSI = lowerStoreUsingVectorIntrinsics(
+ Builder, State.get(getAddr(), Part, !CreateGatherScatter),
+ StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, then EVL must be a valid value set during plan
+ // creation, possibly default value = whole vector register length. EVL
+ // is created only if TTI prefers predicated vectorization, thus if EVL
+ // is not nullptr it also implies preference for predicated
+ // vectorization.
+ // FIXME: Support reverse loading after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewLI = lowerLoadUsingVectorIntrinsics(
+ Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
+ CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99769540f780..bdd26acfd2f8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1973,7 +1973,7 @@ public:
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
constexpr unsigned IntrinsicNumOperands = 2;
- if (auto *CI = dyn_cast<IntrinsicInst>(VL[0]))
+ if (isa<IntrinsicInst>(VL[0]))
NumOperands = IntrinsicNumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
@@ -14141,6 +14141,16 @@ bool BoUpSLP::collectValuesToDemote(
}))
return FinalAnalysis();
+ if (!all_of(I->users(),
+ [=](User *U) {
+ return getTreeEntry(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
+ (U->getType()->isSized() &&
+ DL->getTypeSizeInBits(U->getType()) <= BitWidth);
+ }) &&
+ !IsPotentiallyTruncated(I, BitWidth))
+ return false;
+
unsigned Start = 0;
unsigned End = I->getNumOperands();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f0b7008992d7..8ebd75da3465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -871,13 +871,15 @@ void VPlan::execute(VPTransformState *State) {
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
- bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
- bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+ bool SinglePartNeeded =
+ isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar =
+ isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 813ebda29ffd..77577b516ae2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,15 @@ struct VPTransformState {
ElementCount VF;
unsigned UF;
+ /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
+ /// value set during plan transformation, possibly a default value = whole
+ /// vector register length. EVL is created only if TTI prefers predicated
+ /// vectorization, thus if EVL is not nullptr it also implies preference for
+ /// predicated vectorization.
+ /// TODO: this is a temporarily solution, the EVL must be explicitly used by
+ /// the recipes and must be removed here.
+ VPValue *EVL = nullptr;
+
/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
@@ -1159,6 +1168,7 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ ExplicitVectorLength,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
@@ -2489,6 +2499,45 @@ public:
#endif
};
+/// A recipe for generating the phi node for the current index of elements,
+/// adjusted in accordance with EVL value. It starts at the start value of the
+/// canonical induction and gets incremented by EVL in each iteration of the
+/// vector loop.
+class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
+public:
+ VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
+
+ ~VPEVLBasedIVPHIRecipe() override = default;
+
+ VPEVLBasedIVPHIRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *D) {
+ return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
+ }
+
+ /// Generate phi for handling IV based on EVL over iterations correctly.
+ /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
+ void execute(VPTransformState &State) override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
public:
@@ -2522,8 +2571,8 @@ public:
}
};
-/// A recipe for converting the canonical IV value to the corresponding value of
-/// an IV with different start and step values, using Start + CanonicalIV *
+/// A recipe for converting the input value \p IV value to the corresponding
+/// value of an IV with different start and step values, using Start + IV *
/// Step.
class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// Kind of the induction.
@@ -2541,16 +2590,16 @@ public:
Start, CanonicalIV, Step) {}
VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
- const FPMathOperator *FPBinOp, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
- : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
- Kind(Kind), FPBinOp(FPBinOp) {}
+ const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
+ VPValue *Step)
+ : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
+ FPBinOp(FPBinOp) {}
~VPDerivedIVRecipe() override = default;
VPRecipeBase *clone() override {
- return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(),
- getCanonicalIV(), getStepValue());
+ return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
+ getStepValue());
}
VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
@@ -2570,9 +2619,6 @@ public:
}
VPValue *getStartValue() const { return getOperand(0); }
- VPCanonicalIVPHIRecipe *getCanonicalIV() const {
- return cast<VPCanonicalIVPHIRecipe>(getOperand(1));
- }
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 04e30312dc23..c8ae2ee5a30f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -216,14 +216,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWienIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+ VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 124ae3108d8a..1be0287ce7c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -286,6 +286,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
case VPInstruction::PtrAdd:
+ case VPInstruction::ExplicitVectorLength:
return true;
default:
return false;
@@ -386,6 +387,33 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
return Builder.CreateSelect(Cmp, Sub, Zero);
}
+ case VPInstruction::ExplicitVectorLength: {
+ // Compute EVL
+ auto GetEVL = [=](VPTransformState &State, Value *AVL) {
+ assert(AVL->getType()->isIntegerTy() &&
+ "Requested vector length should be an integer.");
+
+ // TODO: Add support for MaxSafeDist for correct loop emission.
+ assert(State.VF.isScalable() && "Expected scalable vector factor.");
+ Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+ Value *EVL = State.Builder.CreateIntrinsic(
+ State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
+ {AVL, VFArg, State.Builder.getTrue()});
+ return EVL;
+ };
+ // TODO: Restructure this code with an explicit remainder loop, vsetvli can
+ // be outside of the main loop.
+ assert(Part == 0 && "No unrolling expected for predicated vectorization.");
+ // Compute VTC - IV as the AVL (requested vector length).
+ Value *Index = State.get(getOperand(0), VPIteration(0, 0));
+ Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
+ Value *AVL = State.Builder.CreateSub(TripCount, Index);
+ Value *EVL = GetEVL(State, AVL);
+ assert(!State.EVL && "multiple EVL recipes");
+ State.EVL = this;
+ return EVL;
+ }
case VPInstruction::CanonicalIVIncrementForPart: {
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
if (Part == 0)
@@ -592,6 +620,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
+ case VPInstruction::ExplicitVectorLength:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
@@ -628,6 +657,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
+ case VPInstruction::ExplicitVectorLength:
+ O << "EXPLICIT-VECTOR-LENGTH";
+ break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
@@ -1184,7 +1216,7 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "= DERIVED-IV ";
getStartValue()->printAsOperand(O, SlotTracker);
O << " + ";
- getCanonicalIV()->printAsOperand(O, SlotTracker);
+ getOperand(1)->printAsOperand(O, SlotTracker);
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
}
@@ -1974,3 +2006,25 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
+
+void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
+ Value *Start = State.get(getOperand(0), VPIteration(0, 0));
+ PHINode *EntryPart =
+ State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
+ EntryPart->addIncoming(Start, VectorPH);
+ EntryPart->setDebugLoc(getDebugLoc());
+ State.set(this, EntryPart, 0, /*IsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3753060cd6ec..1256e4d8fda5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -641,6 +641,25 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
}
}
+static void recursivelyDeleteDeadRecipes(VPValue *V) {
+ SmallVector<VPValue *> WorkList;
+ SmallPtrSet<VPValue *, 8> Seen;
+ WorkList.push_back(V);
+
+ while (!WorkList.empty()) {
+ VPValue *Cur = WorkList.pop_back_val();
+ if (!Seen.insert(Cur).second)
+ continue;
+ VPRecipeBase *R = Cur->getDefiningRecipe();
+ if (!R)
+ continue;
+ if (!isDeadRecipe(*R))
+ continue;
+ WorkList.append(R->op_begin(), R->op_end());
+ R->eraseFromParent();
+ }
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -674,7 +693,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *BOC =
new VPInstruction(VPInstruction::BranchOnCond,
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+ SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
+ for (VPValue *Op : PossiblyDead)
+ recursivelyDeleteDeadRecipes(Op);
ExitingVPBB->appendRecipe(BOC);
Plan.setVF(BestVF);
Plan.setUF(BestUF);
@@ -1186,6 +1209,45 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
return LaneMaskPhi;
}
+/// Replaces (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern using
+/// the given \p Idiom.
+static void
+replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom,
+ function_ref<bool(VPUser &, unsigned)> Cond = {}) {
+ auto *FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end())
+ return;
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
+ // the given idiom VPValue.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ if (Cond) {
+ CompareToReplace->replaceUsesWithIf(&Idiom, Cond);
+ if (!CompareToReplace->getNumUsers())
+ CompareToReplace->eraseFromParent();
+ } else {
+ CompareToReplace->replaceAllUsesWith(&Idiom);
+ CompareToReplace->eraseFromParent();
+ }
+ }
+ if (!WideCanonicalIV->getNumUsers())
+ WideCanonicalIV->eraseFromParent();
+}
+
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
bool DataAndControlFlowWithoutRuntimeCheck) {
@@ -1215,20 +1277,77 @@ void VPlanTransforms::addActiveLaneMask(
// Walk users of WideCanonicalIV and replace all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
// active-lane-mask.
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
- auto *CompareToReplace = dyn_cast<VPInstruction>(U);
- if (!CompareToReplace ||
- CompareToReplace->getOpcode() != Instruction::ICmp ||
- CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
- CompareToReplace->getOperand(1) != BTC)
- continue;
+ replaceHeaderPredicateWith(Plan, *LaneMask);
+}
- assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
- "WidenCanonicalIV must be the first operand of the compare");
- CompareToReplace->replaceAllUsesWith(LaneMask);
- CompareToReplace->eraseFromParent();
+/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+/// replaces all uses except the canonical IV increment of
+/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
+/// is used only for loop iterations counting after this transformation.
+///
+/// The function uses the following definitions:
+/// %StartV is the canonical induction start value.
+///
+/// The function adds the following recipes:
+///
+/// vector.ph:
+/// ...
+///
+/// vector.body:
+/// ...
+/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
+/// [ %NextEVLIV, %vector.body ]
+/// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC
+/// ...
+/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
+/// ...
+///
+void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ // TODO: revisit this and try to remove the mask operand.
+ // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace
+ // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count),
+ // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
+ Value *TrueMask =
+ ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
+ VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask);
+ replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) {
+ return isa<VPWidenMemoryInstructionRecipe>(U);
+ });
+ // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+ auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+ EVLPhi->insertAfter(CanonicalIVPHI);
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
+ {EVLPhi, Plan.getTripCount()});
+ VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ VPSingleDefRecipe *OpVPEVL = VPEVL;
+ if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
+ IVSize != 32) {
+ OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
+ : Instruction::ZExt,
+ OpVPEVL, CanonicalIVPHI->getScalarType());
+ OpVPEVL->insertBefore(CanonicalIVIncrement);
}
+ auto *NextEVLIV =
+ new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
+ {CanonicalIVIncrement->hasNoUnsignedWrap(),
+ CanonicalIVIncrement->hasNoSignedWrap()},
+ CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
+ NextEVLIV->insertBefore(CanonicalIVIncrement);
+ EVLPhi->addOperand(NextEVLIV);
+
+ // Replace all uses of VPCanonicalIVPHIRecipe by
+ // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
+ CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
+ CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+ // TODO: support unroll factor > 1.
+ Plan.setUF(1);
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -1254,9 +1373,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec) ||
- isa<VPActiveLaneMaskPHIRecipe>(CurRec))
+ isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ff83c3f083b0..0cbc70713d9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -98,6 +98,13 @@ struct VPlanTransforms {
/// VPlan directly.
static void dropPoisonGeneratingRecipes(
VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication);
+
+ /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+ /// replaces all uses except the canonical IV increment of
+ /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
+ /// VPCanonicalIVPHIRecipe is only used to control the loop after
+ /// this transformation.
+ static void addExplicitVectorLength(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1d2c17e91b7a..8b221d30e525 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -368,6 +368,7 @@ public:
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
+ VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ebdb914fb85..12d37fa711db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -92,7 +92,50 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB,
for (const VPRecipeBase &R : *VPBB)
RecipeNumbering[&R] = Cnt++;
+ // Set of recipe types along with VPInstruction Opcodes of all EVL-related
+ // recipes that must appear at most once in the header block.
+ DenseSet<unsigned> EVLFound;
+ const VPRecipeBase *VPWidenMemRecipe = nullptr;
+ const VPlan *Plan = VPBB->getPlan();
+ bool IsHeader = Plan->getEntry()->getNumSuccessors() == 1 &&
+ Plan->getVectorLoopRegion()->getEntry() == VPBB;
+ auto CheckEVLRecipiesInsts = [&](const VPRecipeBase *R) {
+ if (isa<VPEVLBasedIVPHIRecipe>(R)) {
+ if (!IsHeader) {
+ errs() << "EVL PHI recipe not in entry block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(VPDef::VPEVLBasedIVPHISC).second) {
+ errs() << "EVL PHI recipe inserted more than once!\n";
+ return false;
+ }
+ return true;
+ }
+ if (const auto *RInst = dyn_cast<VPInstruction>(R);
+ RInst && RInst->getOpcode() == VPInstruction::ExplicitVectorLength) {
+ if (!IsHeader) {
+ errs() << "EVL instruction not in the header block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(RInst->getOpcode() + VPDef::VPLastPHISC).second) {
+ errs() << "EVL instruction inserted more than once!\n";
+ return false;
+ }
+ if (VPWidenMemRecipe) {
+ errs() << "Use of EVL instruction by widen memory recipe before "
+ "definition!\n";
+ return false;
+ }
+ return true;
+ }
+ if (isa<VPWidenMemoryInstructionRecipe>(R))
+ VPWidenMemRecipe = R;
+ return true;
+ };
+
for (const VPRecipeBase &R : *VPBB) {
+ if (!CheckEVLRecipiesInsts(&R))
+ return false;
for (const VPValue *V : R.definedValues()) {
for (const VPUser *U : V->users()) {
auto *UI = dyn_cast<VPRecipeBase>(U);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index af5e7c9bc385..3738220b4f81 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ private:
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
bool foldTruncFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1432,6 +1433,75 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
+/// into "castop (shuffle)".
+bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
+ Value *V0, *V1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
+ m_Mask(Mask))))
+ return false;
+
+ auto *C0 = dyn_cast<CastInst>(V0);
+ auto *C1 = dyn_cast<CastInst>(V1);
+ if (!C0 || !C1)
+ return false;
+
+ Instruction::CastOps Opcode = C0->getOpcode();
+ if (Opcode == Instruction::BitCast || C0->getSrcTy() != C1->getSrcTy())
+ return false;
+
+ // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+ if (Opcode != C1->getOpcode()) {
+ if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+ Opcode = Instruction::SExt;
+ else
+ return false;
+ }
+
+ auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
+ auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
+ if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
+ return false;
+ assert(CastDstTy->getElementCount() == CastSrcTy->getElementCount() &&
+ "Unexpected src/dst element counts");
+
+ auto *NewShuffleDstTy =
+ FixedVectorType::get(CastSrcTy->getScalarType(), Mask.size());
+
+ // Try to replace a castop with a shuffle if the shuffle is not costly.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost OldCost =
+ TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind) +
+ TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ CastDstTy, Mask, CostKind);
+
+ InstructionCost NewCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind);
+ NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
+ TTI::CastContextHint::None, CostKind);
+ if (NewCost > OldCost)
+ return false;
+
+ Value *Shuf =
+ Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), Mask);
+ Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
+
+ // Intersect flags from the old casts.
+ if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
+ NewInst->copyIRFlags(C0);
+ NewInst->andIRFlags(C1);
+ }
+
+ replaceValue(I, *Cast);
+ return true;
+}
+
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
@@ -1986,6 +2056,7 @@ bool VectorCombine::run() {
break;
case Instruction::ShuffleVector:
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldSelectShuffle(I);
break;
case Instruction::BitCast: