summary refs log tree commit diff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
author Michael Kruse <llvm-project@meinersbur.de> 2025-01-03 10:22:51 +0100
committer Michael Kruse <llvm-project@meinersbur.de> 2025-01-03 10:22:51 +0100
commit 38500d63e14ce340236840f60d356cdefb56a52c (patch)
tree 17edbec446ce9b50d2f215a483b83afb293a635d /llvm/lib/Transforms
parent 1a3d5daaef7a6a63448a497da3eff7fc9e23df26 (diff)
parent 27f30029741ecf023baece7b3dde1ff9011ffefc (diff)
Merge branch 'main' into users/meinersbur/flang_runtime_split-headers [users/meinersbur/flang_runtime_split-headers]
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/AlwaysInliner.cpp42
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp106
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp24
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp18
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp46
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp21
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp74
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp146
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp6
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp257
-rw-r--r--llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp141
-rw-r--r--llvm/lib/Transforms/Instrumentation/CMakeLists.txt2
-rw-r--r--llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp64
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfiler.cpp165
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp17
-rw-r--r--llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp358
-rw-r--r--llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp28
-rw-r--r--llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp6
-rw-r--r--llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp898
-rw-r--r--llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp153
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp29
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/GuardWidening.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp100
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp13
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp78
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp111
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp24
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp10
-rw-r--r--llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/BuildLibCalls.cpp27
-rw-r--r--llvm/lib/Transforms/Utils/CallPromotionUtils.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/CloneFunction.cpp218
-rw-r--r--llvm/lib/Transforms/Utils/CodeExtractor.cpp19
-rw-r--r--llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp15
-rw-r--r--llvm/lib/Transforms/Utils/Evaluator.cpp40
-rw-r--r--llvm/lib/Transforms/Utils/FunctionComparator.cpp29
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp30
-rw-r--r--llvm/lib/Transforms/Utils/LoopPeel.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LoopRotationUtils.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/LoopSimplify.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp19
-rw-r--r--llvm/lib/Transforms/Utils/LoopVersioning.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/SSAUpdater.cpp14
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp243
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyIndVar.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp82
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp38
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h17
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp1046
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp880
-rw-r--r--llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp107
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp272
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h390
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h16
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp231
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp580
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp650
79 files changed, 5080 insertions, 2975 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 45ee2d472a11..12ae6740e055 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
/// of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
+namespace {
struct MaskOps {
Value *Root = nullptr;
APInt Mask;
@@ -190,6 +191,7 @@ struct MaskOps {
MaskOps(unsigned BitWidth, bool MatchAnds)
: Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
};
+} // namespace
/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
/// chain of 'and' or 'or' instructions looking for shift ops of a common source
diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index 0baa34d50abf..20fc630a74a8 100644
--- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -32,10 +32,9 @@ namespace {
bool AlwaysInlineImpl(
Module &M, bool InsertLifetime, ProfileSummaryInfo &PSI,
+ FunctionAnalysisManager *FAM,
function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
- function_ref<AAResults &(Function &)> GetAAR,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<BlockFrequencyInfo *(Function &)> GetCachedBFI) {
+ function_ref<AAResults &(Function &)> GetAAR) {
SmallSetVector<CallBase *, 16> Calls;
bool Changed = false;
SmallVector<Function *, 16> InlinedComdatFunctions;
@@ -62,12 +61,7 @@ bool AlwaysInlineImpl(
DebugLoc DLoc = CB->getDebugLoc();
BasicBlock *Block = CB->getParent();
- // Only update CallerBFI if already available. The CallerBFI update
- // requires CalleeBFI.
- BlockFrequencyInfo *CallerBFI = GetCachedBFI(*Caller);
- InlineFunctionInfo IFI(GetAssumptionCache, &PSI, CallerBFI,
- CallerBFI ? &GetBFI(F) : nullptr);
-
+ InlineFunctionInfo IFI(GetAssumptionCache, &PSI, nullptr, nullptr);
InlineResult Res = InlineFunction(*CB, IFI, /*MergeAttributes=*/true,
&GetAAR(F), InsertLifetime);
if (!Res.isSuccess()) {
@@ -86,6 +80,8 @@ bool AlwaysInlineImpl(
/*ForProfileContext=*/false, DEBUG_TYPE);
Changed = true;
+ if (FAM)
+ FAM->invalidate(*Caller, PreservedAnalyses::none());
}
F.removeDeadConstantUsers();
@@ -95,6 +91,8 @@ bool AlwaysInlineImpl(
if (F.hasComdat()) {
InlinedComdatFunctions.push_back(&F);
} else {
+ if (FAM)
+ FAM->clear(F, F.getName());
M.getFunctionList().erase(F);
Changed = true;
}
@@ -107,6 +105,8 @@ bool AlwaysInlineImpl(
filterDeadComdatFunctions(InlinedComdatFunctions);
// The remaining functions are actually dead.
for (Function *F : InlinedComdatFunctions) {
+ if (FAM)
+ FAM->clear(*F, F->getName());
M.getFunctionList().erase(F);
Changed = true;
}
@@ -136,12 +136,9 @@ struct AlwaysInlinerLegacyPass : public ModulePass {
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
};
- auto GetCachedBFI = [](Function &) -> BlockFrequencyInfo * {
- return nullptr;
- };
- return AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache, GetAAR,
- /*GetBFI=*/nullptr, GetCachedBFI);
+ return AlwaysInlineImpl(M, InsertLifetime, PSI, /*FAM=*/nullptr,
+ GetAssumptionCache, GetAAR);
}
static char ID; // Pass identification, replacement for typeid
@@ -175,19 +172,18 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
- auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
- auto GetCachedBFI = [&](Function &F) -> BlockFrequencyInfo * {
- return FAM.getCachedResult<BlockFrequencyAnalysis>(F);
- };
auto GetAAR = [&](Function &F) -> AAResults & {
return FAM.getResult<AAManager>(F);
};
auto &PSI = MAM.getResult<ProfileSummaryAnalysis>(M);
- bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache,
- GetAAR, GetBFI, GetCachedBFI);
+ bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, &FAM,
+ GetAssumptionCache, GetAAR);
+ if (!Changed)
+ return PreservedAnalyses::all();
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ // We have already invalidated all analyses on modified functions.
+ PA.preserveSet<AllAnalysesOn<Function>>();
+ return PA;
}
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index afb0ea72b269..fe9cca01a8f3 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -633,7 +633,7 @@ ArgumentAccessInfo getArgmentAccessInfo(const Instruction *I,
[](Value *Length,
std::optional<int64_t> Offset) -> std::optional<ConstantRange> {
auto *ConstantLength = dyn_cast<ConstantInt>(Length);
- if (ConstantLength && Offset)
+ if (ConstantLength && Offset && !ConstantLength->isNegative())
return ConstantRange(
APInt(64, *Offset, true),
APInt(64, *Offset + ConstantLength->getSExtValue(), true));
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index ea92c6e2f59e..1bf7ff468d78 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
} // namespace llvm
extern cl::opt<bool> MemProfReportHintedSizes;
+extern cl::opt<unsigned> MinClonedColdBytePercent;
namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ private:
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
}
+ /// Get the AllocationType assigned to the given allocation instruction clone.
+ AllocationType getAllocationCallType(const CallInfo &Call) const {
+ return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
+ }
+
/// Update non-allocation call to invoke (possibly cloned) function
/// CalleeFunc.
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ private:
/// Map from each contextID to the profiled full contexts and their total
/// sizes (there may be more than one due to context trimming),
- /// optionally populated when requested (via MemProfReportHintedSizes).
+ /// optionally populated when requested (via MemProfReportHintedSizes or
+ /// MinClonedColdBytePercent).
DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
/// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ private:
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ private:
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
@@ -896,21 +905,6 @@ struct DenseMapInfo<IndexCall>
namespace {
-struct FieldSeparator {
- bool Skip = true;
- const char *Sep;
-
- FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
-};
-
-raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
- if (FS.Skip) {
- FS.Skip = false;
- return OS;
- }
- return OS << FS.Sep;
-}
-
// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
@@ -1216,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextIdToAllocationType[++LastContextId] = AllocType;
- if (MemProfReportHintedSizes) {
- assert(!ContextSizeInfo.empty());
+ if (!ContextSizeInfo.empty()) {
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
}
@@ -2058,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
EmptyContext;
unsigned I = 0;
- assert(!MemProfReportHintedSizes ||
- AN.ContextSizeInfos.size() == AN.MIBs.size());
+ assert(
+ (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+ AN.ContextSizeInfos.size() == AN.MIBs.size());
// Now add all of the MIBs and their stack nodes.
for (auto &MIB : AN.MIBs) {
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
StackContext(&MIB);
std::vector<ContextTotalSize> ContextSizeInfo;
- if (MemProfReportHintedSizes) {
+ if (!AN.ContextSizeInfos.empty()) {
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
ContextSizeInfo.push_back({FullStackId, TotalSize});
}
@@ -2784,9 +2778,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
OS << "\t\t" << *Edge << "\n";
if (!Clones.empty()) {
OS << "\tClones: ";
- FieldSeparator FS;
+ ListSeparator LS;
for (auto *Clone : Clones)
- OS << FS << Clone;
+ OS << LS << Clone;
OS << "\n";
} else if (CloneOf) {
OS << "\tClone of " << CloneOf << "\n";
@@ -2840,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
if (!Node->IsAllocation)
continue;
DenseSet<uint32_t> ContextIds = Node->getContextIds();
+ auto AllocTypeFromCall = getAllocationCallType(Node->Call);
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds) {
@@ -2852,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
<< getAllocTypeString((uint8_t)TypeI->second)
<< " full allocation context " << Info.FullStackId
<< " with total size " << Info.TotalSize << " is "
- << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+ << getAllocTypeString(Node->AllocTypes) << " after cloning";
+ if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
+ OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
+ << " due to cold byte percent";
+ OS << "\n";
}
}
}
@@ -3384,6 +3383,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
break;
+ // If the caller was not successfully matched to a call in the IR/summary,
+ // there is no point in trying to clone for it as we can't update that call.
+ if (!CallerEdge->Caller->hasCall()) {
+ ++EI;
+ continue;
+ }
+
// Only need to process the ids along this edge pertaining to the given
// allocation.
auto CallerEdgeContextsForAlloc =
@@ -3495,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
}
+AllocationType
+ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+ const auto *CB = cast<CallBase>(Call.call());
+ if (!CB->getAttributes().hasFnAttr("memprof"))
+ return AllocationType::None;
+ return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+ ? AllocationType::Cold
+ : AllocationType::NotCold;
+}
+
+AllocationType
+IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+ const auto *AI = Call.call().dyn_cast<AllocInfo *>();
+ assert(AI->Versions.size() > Call.cloneNo());
+ return (AllocationType)AI->Versions[Call.cloneNo()];
+}
+
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
if (CalleeFunc.cloneNo() > 0)
@@ -4025,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
}
+ uint8_t BothTypes =
+ (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+
auto UpdateCalls = [&](ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
auto &&UpdateCalls) {
@@ -4044,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
return;
if (Node->IsAllocation) {
- updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+ auto AT = allocTypeToUse(Node->AllocTypes);
+ // If the allocation type is ambiguous, and more aggressive hinting
+ // has been enabled via the MinClonedColdBytePercent flag, see if this
+ // allocation should be hinted cold anyway because its fraction of cold
+ // bytes allocated is at least the given threshold.
+ if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+ !ContextIdToContextSizeInfos.empty()) {
+ uint64_t TotalCold = 0;
+ uint64_t Total = 0;
+ for (auto Id : Node->getContextIds()) {
+ auto TypeI = ContextIdToAllocationType.find(Id);
+ assert(TypeI != ContextIdToAllocationType.end());
+ auto CSI = ContextIdToContextSizeInfos.find(Id);
+ if (CSI != ContextIdToContextSizeInfos.end()) {
+ for (auto &Info : CSI->second) {
+ Total += Info.TotalSize;
+ if (TypeI->second == AllocationType::Cold)
+ TotalCold += Info.TotalSize;
+ }
+ }
+ }
+ if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+ AT = AllocationType::Cold;
+ }
+ updateAllocationCall(Node->Call, AT);
assert(Node->MatchingCalls.empty());
return;
}
@@ -4427,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// will still be none type or should have gotten the default NotCold.
// Skip that after calling clone helper since that does some sanity
// checks that confirm we haven't decided yet that we need cloning.
- if (AllocNode.Versions.size() == 1) {
+ // We might have a single version that is cold due to the
+ // MinClonedColdBytePercent heuristic; make sure we don't skip in that
+ // case.
+ if (AllocNode.Versions.size() == 1 &&
+ (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
assert((AllocationType)AllocNode.Versions[0] ==
AllocationType::NotCold ||
(AllocationType)AllocNode.Versions[0] ==
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index b2fa66f2a6d3..603beb3b883d 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -529,7 +529,7 @@ protected:
void generateMDProfMetadata(Function &F);
bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
const SampleProfileMap &Profiles);
- void removePseudoProbeInsts(Module &M);
+ void removePseudoProbeInstsDiscriminator(Module &M);
/// Map from function name to Function *. Used to find the function from
/// the function name. If the function name contains suffix, additional
@@ -2138,13 +2138,25 @@ bool SampleProfileLoader::rejectHighStalenessProfile(
return false;
}
-void SampleProfileLoader::removePseudoProbeInsts(Module &M) {
+void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) {
for (auto &F : M) {
std::vector<Instruction *> InstsToDel;
for (auto &BB : F) {
for (auto &I : BB) {
if (isa<PseudoProbeInst>(&I))
InstsToDel.push_back(&I);
+ else if (isa<CallBase>(&I))
+ if (const DILocation *DIL = I.getDebugLoc().get()) {
+ // Restore dwarf discriminator for call.
+ unsigned Discriminator = DIL->getDiscriminator();
+ if (DILocation::isPseudoProbeDiscriminator(Discriminator)) {
+ std::optional<uint32_t> DwarfDiscriminator =
+ PseudoProbeDwarfDiscriminator::extractDwarfBaseDiscriminator(
+ Discriminator);
+ I.setDebugLoc(DIL->cloneWithDiscriminator(
+ DwarfDiscriminator ? *DwarfDiscriminator : 0));
+ }
+ }
}
}
for (auto *I : InstsToDel)
@@ -2224,8 +2236,12 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
- if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased)
- removePseudoProbeInsts(M);
+ if (RemoveProbeAfterProfileAnnotation &&
+ FunctionSamples::ProfileIsProbeBased) {
+ removePseudoProbeInstsDiscriminator(M);
+ if (auto *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName))
+ M.eraseNamedMetadata(FuncInfo);
+ }
return retval;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index ea7942ef9781..7a184a19d7c5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1289,7 +1289,7 @@ static Instruction *foldAddToAshr(BinaryOperator &Add) {
// Note that, by the time we end up here, if possible, ugt has been
// canonicalized into eq.
const APInt *MaskC, *MaskCCmp;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(Add.getOperand(1),
m_SExt(m_ICmp(Pred, m_And(m_Specific(X), m_APInt(MaskC)),
m_APInt(MaskCCmp)))))
@@ -1382,7 +1382,7 @@ Instruction *InstCombinerImpl::
// `select` itself may be appropriately extended, look past that.
SkipExtInMagic(Select);
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
const APInt *Thr;
Value *SignExtendingValue, *Zero;
bool ShouldSignext;
@@ -1654,7 +1654,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
return replaceInstUsesWith(I, Constant::getNullValue(I.getType()));
// sext(A < B) + zext(A > B) => ucmp/scmp(A, B)
- ICmpInst::Predicate LTPred, GTPred;
+ CmpPredicate LTPred, GTPred;
if (match(&I,
m_c_Add(m_SExt(m_c_ICmp(LTPred, m_Value(A), m_Value(B))),
m_ZExt(m_c_ICmp(GTPred, m_Deferred(A), m_Deferred(B))))) &&
@@ -1841,7 +1841,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// -->
// BW - ctlz(A - 1, false)
const APInt *XorC;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(&I,
m_c_Add(
m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)),
@@ -2280,6 +2280,16 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
+ // if (C1 & C2) == C2 then (X & C1) - (X & C2) -> X & (C1 ^ C2)
+ Constant *C1, *C2;
+ if (match(Op0, m_And(m_Value(X), m_ImmConstant(C1))) &&
+ match(Op1, m_And(m_Specific(X), m_ImmConstant(C2)))) {
+ Value *AndC = ConstantFoldBinaryInstruction(Instruction::And, C1, C2);
+ if (C2->isElementWiseEqual(AndC))
+ return BinaryOperator::CreateAnd(
+ X, ConstantFoldBinaryInstruction(Instruction::Xor, C1, C2));
+ }
+
// Reassociate sub/add sequences to create more add instructions and
// reduce dependency chains:
// ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b4033fc2a418..e576eea4ca36 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -455,14 +455,20 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
// RHS. For example,
// (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
// (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- if (IsSuperSetOrEqual(BCst, DCst))
+ if (IsSuperSetOrEqual(BCst, DCst)) {
+ // We can't guarantee that samesign holds after this fold.
+ RHS->setSameSign(false);
return RHS;
+ }
// Otherwise, B is a subset of D. If B and E have a common bit set,
// ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
// (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
- if ((*BCst & ECst) != 0)
+ if ((*BCst & ECst) != 0) {
+ // We can't guarantee that samesign holds after this fold.
+ RHS->setSameSign(false);
return RHS;
+ }
// Otherwise, LHS and RHS contradict and the whole expression becomes false
// (or true if negated.) For example,
// (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
@@ -695,13 +701,17 @@ Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
Cmp1->getPredicate());
Value *Input = Cmp0->getOperand(0);
+ Value *Cmp1Op0 = Cmp1->getOperand(0);
+ Value *Cmp1Op1 = Cmp1->getOperand(1);
Value *RangeEnd;
- if (Cmp1->getOperand(0) == Input) {
+ if (match(Cmp1Op0, m_SExtOrSelf(m_Specific(Input)))) {
// For the upper range compare we have: icmp x, n
- RangeEnd = Cmp1->getOperand(1);
- } else if (Cmp1->getOperand(1) == Input) {
+ Input = Cmp1Op0;
+ RangeEnd = Cmp1Op1;
+ } else if (match(Cmp1Op1, m_SExtOrSelf(m_Specific(Input)))) {
// For the upper range compare we have: icmp n, x
- RangeEnd = Cmp1->getOperand(0);
+ Input = Cmp1Op1;
+ RangeEnd = Cmp1Op0;
Pred1 = ICmpInst::getSwappedPredicate(Pred1);
} else {
return nullptr;
@@ -734,7 +744,7 @@ static Value *
foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder,
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
const SimplifyQuery &Q) {
- CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ CmpPredicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
// Make sure we have right compares for our op.
if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred)
return nullptr;
@@ -871,7 +881,7 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
// Try to match/decompose into: icmp eq (X & Mask), 0
auto tryToDecompose = [](ICmpInst *ICmp, Value *&X,
APInt &UnsetBitsMask) -> bool {
- CmpInst::Predicate Pred = ICmp->getPredicate();
+ CmpPredicate Pred = ICmp->getPredicate();
// Can it be decomposed into icmp eq (X & Mask), 0 ?
auto Res =
llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1),
@@ -940,7 +950,7 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd,
InstCombiner::BuilderTy &Builder,
InstCombinerImpl &IC) {
- CmpInst::Predicate Pred0, Pred1;
+ CmpPredicate Pred0, Pred1;
Value *X;
if (!match(Cmp0, m_ICmp(Pred0, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
m_SpecificInt(1))) ||
@@ -1113,12 +1123,12 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
const SimplifyQuery &Q,
InstCombiner::BuilderTy &Builder) {
Value *ZeroCmpOp;
- ICmpInst::Predicate EqPred;
+ CmpPredicate EqPred;
if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) ||
!ICmpInst::isEquality(EqPred))
return nullptr;
- ICmpInst::Predicate UnsignedPred;
+ CmpPredicate UnsignedPred;
Value *A, *B;
if (match(UnsignedICmp,
@@ -1277,7 +1287,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
const SimplifyQuery &Q) {
// Match an equality compare with a non-poison constant as Cmp0.
// Also, give up if the compare can be constant-folded to avoid looping.
- ICmpInst::Predicate Pred0;
+ CmpPredicate Pred0;
Value *X;
Constant *C;
if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
@@ -1291,7 +1301,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
// common operand as operand 1 (Pred1 is swapped if the common operand was
// operand 0).
Value *Y;
- ICmpInst::Predicate Pred1;
+ CmpPredicate Pred1;
if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Specific(X))))
return nullptr;
@@ -1322,7 +1332,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
Value *InstCombinerImpl::foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1,
ICmpInst *ICmp2,
bool IsAnd) {
- ICmpInst::Predicate Pred1, Pred2;
+ CmpPredicate Pred1, Pred2;
Value *V1, *V2;
const APInt *C1, *C2;
if (!match(ICmp1, m_ICmp(Pred1, m_Value(V1), m_APInt(C1))) ||
@@ -1344,12 +1354,12 @@ Value *InstCombinerImpl::foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1,
return nullptr;
ConstantRange CR1 = ConstantRange::makeExactICmpRegion(
- IsAnd ? ICmpInst::getInversePredicate(Pred1) : Pred1, *C1);
+ IsAnd ? ICmpInst::getInverseCmpPredicate(Pred1) : Pred1, *C1);
if (Offset1)
CR1 = CR1.subtract(*Offset1);
ConstantRange CR2 = ConstantRange::makeExactICmpRegion(
- IsAnd ? ICmpInst::getInversePredicate(Pred2) : Pred2, *C2);
+ IsAnd ? ICmpInst::getInverseCmpPredicate(Pred2) : Pred2, *C2);
if (Offset2)
CR2 = CR2.subtract(*Offset2);
@@ -3939,7 +3949,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
return V;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
// Check if the OR weakens the overflow condition for umul.with.overflow by
// treating any non-zero result as overflow. In that case, we overflow if both
@@ -4604,7 +4614,7 @@ Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) {
}
// not (cmp A, B) = !cmp A, B
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(NotOp, m_Cmp(Pred, m_Value(), m_Value())) &&
(NotOp->hasOneUse() ||
InstCombiner::canFreelyInvertAllUsersOf(cast<Instruction>(NotOp),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 7221c987b982..0b9379965f42 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -934,6 +934,11 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
}
}
+ if (DestWidth == 1 &&
+ (Trunc.hasNoUnsignedWrap() || Trunc.hasNoSignedWrap()) &&
+ isKnownNonZero(Src, SQ.getWithInstruction(&Trunc)))
+ return replaceInstUsesWith(Trunc, ConstantInt::getTrue(DestTy));
+
bool Changed = false;
if (!Trunc.hasNoSignedWrap() &&
ComputeMaxSignificantBits(Src, /*Depth=*/0, &Trunc) <= DestWidth) {
@@ -1847,15 +1852,16 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
Value *X;
Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
if (Op && Op->hasOneUse()) {
- // FIXME: The FMF should propagate from the fptrunc, not the source op.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (isa<FPMathOperator>(Op))
- Builder.setFastMathFlags(Op->getFastMathFlags());
+ FastMathFlags FMF = FPT.getFastMathFlags();
+ if (auto *FPMO = dyn_cast<FPMathOperator>(Op))
+ FMF &= FPMO->getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
if (match(Op, m_FNeg(m_Value(X)))) {
Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
-
- return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
+ Value *Neg = Builder.CreateFNeg(InnerTrunc);
+ return replaceInstUsesWith(FPT, Neg);
}
// If we are truncating a select that has an extended operand, we can
@@ -2106,10 +2112,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
Base->getType() == Ty) {
Value *Offset = EmitGEPOffset(GEP);
auto *NewOp = BinaryOperator::CreateAdd(Base, Offset);
- if (GEP->hasNoUnsignedWrap() ||
- (GEP->hasNoUnsignedSignedWrap() &&
- isKnownNonNegative(Offset, SQ.getWithInstruction(&CI))))
- NewOp->setHasNoUnsignedWrap(true);
+ NewOp->setHasNoUnsignedWrap(GEP->hasNoUnsignedWrap());
return NewOp;
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 56391d320e8b..d6fdade25559 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -689,13 +689,32 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (!isa<GetElementPtrInst>(RHS))
RHS = RHS->stripPointerCasts();
+ auto CanFold = [Cond](GEPNoWrapFlags NW) { // Gate for the offset folds.
+ if (ICmpInst::isEquality(Cond))
+ return true; // Equality predicates fold regardless of NW.
+
+ // Unsigned predicates can be folded if the GEPs have *any* nowrap flags.
+ assert(ICmpInst::isUnsigned(Cond));
+ return NW != GEPNoWrapFlags::none();
+ };
+
+ auto NewICmp = [Cond](GEPNoWrapFlags NW, Value *Op1, Value *Op2) {
+ if (!NW.hasNoUnsignedWrap()) {
+ // No nuw: convert the unsigned predicate to its signed counterpart.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Op1, Op2);
+ }
+
+ auto *I = new ICmpInst(Cond, Op1, Op2); // nuw: keep Cond as given.
+ I->setSameSign(NW.hasNoUnsignedSignedWrap()); // samesign iff nusw too.
+ return I;
+ };
+
Value *PtrBase = GEPLHS->getOperand(0);
- if (PtrBase == RHS &&
- (GEPLHS->hasNoUnsignedSignedWrap() || ICmpInst::isEquality(Cond))) {
+ if (PtrBase == RHS && CanFold(GEPLHS->getNoWrapFlags())) {
// ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
Value *Offset = EmitGEPOffset(GEPLHS);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
- Constant::getNullValue(Offset->getType()));
+ return NewICmp(GEPLHS->getNoWrapFlags(), Offset,
+ Constant::getNullValue(Offset->getType()));
}
if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) &&
@@ -813,19 +832,18 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
return replaceInstUsesWith(I, // No comparison is needed here.
ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
- else if (NumDifferences == 1 && NW.hasNoUnsignedSignedWrap()) {
+ else if (NumDifferences == 1 && CanFold(NW)) {
Value *LHSV = GEPLHS->getOperand(DiffOperand);
Value *RHSV = GEPRHS->getOperand(DiffOperand);
- // Make sure we do a signed comparison here.
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+ return NewICmp(NW, LHSV, RHSV);
}
}
- if (NW.hasNoUnsignedSignedWrap() || CmpInst::isEquality(Cond)) {
+ if (CanFold(NW)) {
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true);
Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+ return NewICmp(NW, L, R);
}
}
@@ -1155,7 +1173,7 @@ Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
// This fold is only valid for equality predicates.
if (!I.isEquality())
return nullptr;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *X, *Y, *Zero;
if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
m_CombineAnd(m_Zero(), m_Value(Zero)))))
@@ -1172,7 +1190,7 @@ Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
/// by one-less-than-bitwidth into a sign test on the original value.
Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) {
Instruction *Val;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
return nullptr;
@@ -1386,7 +1404,7 @@ Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
};
for (BranchInst *BI : DC.conditionsFor(X)) {
- ICmpInst::Predicate DomPred;
+ CmpPredicate DomPred;
const APInt *DomC;
if (!match(BI->getCondition(),
m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))))
@@ -1499,7 +1517,7 @@ Instruction *
InstCombinerImpl::foldICmpTruncWithTruncOrExt(ICmpInst &Cmp,
const SimplifyQuery &Q) {
Value *X, *Y;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
bool YIsSExt = false;
// Try to match icmp (trunc X), (trunc Y)
if (match(&Cmp, m_ICmp(Pred, m_Trunc(m_Value(X)), m_Trunc(m_Value(Y))))) {
@@ -3231,7 +3249,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
// i32 Equal,
// i32 (select i1 (a < b), i32 Less, i32 Greater)
// where Equal, Less and Greater are placeholders for any three constants.
- ICmpInst::Predicate PredA;
+ CmpPredicate PredA;
if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) ||
!ICmpInst::isEquality(PredA))
return false;
@@ -3242,7 +3260,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
std::swap(EqualVal, UnequalVal);
if (!match(EqualVal, m_ConstantInt(Equal)))
return false;
- ICmpInst::Predicate PredB;
+ CmpPredicate PredB;
Value *LHS2, *RHS2;
if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)),
m_ConstantInt(Less), m_ConstantInt(Greater))))
@@ -3604,7 +3622,8 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
m_OneUse(m_c_Or(m_CombineAnd(m_Value(Sel),
m_Select(m_Value(Cond), m_Value(TV),
m_Value(FV))),
- m_Value(Other))))) {
+ m_Value(Other)))) &&
+ Cond->getType() == Cmp.getType()) {
const SimplifyQuery Q = SQ.getWithInstruction(&Cmp);
// Easy case is if eq/ne matches whether 0 is trueval/falseval.
if (Pred == ICmpInst::ICMP_EQ
@@ -4546,7 +4565,7 @@ static Value *foldICmpWithLowBitMaskedVal(CmpPredicate Pred, Value *Op0,
static Value *
foldICmpWithTruncSignExtendedVal(ICmpInst &I,
InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate SrcPred;
+ CmpPredicate SrcPred;
Value *X;
const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
// We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
@@ -4792,7 +4811,7 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
/// will mean that we are looking for the opposite answer.
Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) {
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *X, *Y;
Instruction *Mul;
Instruction *Div;
@@ -4862,7 +4881,7 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) {
static Instruction *foldICmpXNegX(ICmpInst &I,
InstCombiner::BuilderTy &Builder) {
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *X;
if (match(&I, m_c_ICmp(Pred, m_NSWNeg(m_Value(X)), m_Deferred(X)))) {
@@ -5347,6 +5366,15 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
return new ICmpInst(Pred, X, Y);
if (ZKnown.isNegative())
return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), X, Y);
+ Value *LessThan = simplifyICmpInst(ICmpInst::ICMP_SLT, X, Y,
+ SQ.getWithInstruction(&I));
+ if (LessThan && match(LessThan, m_One()))
+ return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Z,
+ Constant::getNullValue(Z->getType()));
+ Value *GreaterThan = simplifyICmpInst(ICmpInst::ICMP_SGT, X, Y,
+ SQ.getWithInstruction(&I));
+ if (GreaterThan && match(GreaterThan, m_One()))
+ return new ICmpInst(Pred, Z, Constant::getNullValue(Z->getType()));
}
} else {
bool NonZero;
@@ -6794,7 +6822,7 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
/// then try to reduce patterns based on that limit.
Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) {
Value *X, *Y;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
// X must be 0 and bool must be true for "ULT":
// X <u (zext i1 Y) --> (X == 0) & Y
@@ -6809,7 +6837,7 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) {
return BinaryOperator::CreateOr(Builder.CreateIsNull(X), Y);
// icmp eq/ne X, (zext/sext (icmp eq/ne X, C))
- ICmpInst::Predicate Pred1, Pred2;
+ CmpPredicate Pred1, Pred2;
const APInt *C;
Instruction *ExtI;
if (match(&I, m_c_ICmp(Pred1, m_Value(X),
@@ -7079,7 +7107,7 @@ static Instruction *canonicalizeICmpBool(ICmpInst &I,
// (X l>> Y) == 0
static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred, NewPred;
+ CmpPredicate Pred, NewPred;
Value *X, *Y;
if (match(&Cmp,
m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
@@ -7244,7 +7272,7 @@ static Instruction *foldReductionIdiom(ICmpInst &I,
const DataLayout &DL) {
if (I.getType()->isVectorTy())
return nullptr;
- ICmpInst::Predicate OuterPred, InnerPred;
+ CmpPredicate OuterPred, InnerPred;
Value *LHS, *RHS;
// Match lowering of @llvm.vector.reduce.and. Turn
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 28474fec8238..3a074ee70dc4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -600,7 +600,8 @@ public:
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
/// only possible if all operands to the PHI are constants).
- Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN,
+ bool AllowMultipleUses = false);
/// For a binary operator with 2 phi operands, try to hoist the binary
/// operation before the phi. This can result in fewer instructions in
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 1fcf1c570add..272a1942c335 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -788,6 +788,9 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
BasicBlock *BB = std::get<0>(Incoming);
Value *V = std::get<1>(Incoming);
LoadInst *LI = cast<LoadInst>(V);
+ // FIXME: https://github.com/llvm/llvm-project/issues/121495
+ // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs
+ // doesn't need to be maintained here.
combineMetadata(NewLI, LI, KnownIDs, true);
Value *NewInVal = LI->getOperand(0);
if (NewInVal != InVal)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index c7a0c35d099c..e7a8e947705f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -58,7 +58,7 @@ static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
// The select condition must be an equality compare with a constant operand.
Value *X;
Constant *C;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C))))
return nullptr;
@@ -425,17 +425,19 @@ Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
// icmp with a common operand also can have the common operand
// pulled after the select.
- ICmpInst::Predicate TPred, FPred;
+ CmpPredicate TPred, FPred;
if (match(TI, m_ICmp(TPred, m_Value(), m_Value())) &&
match(FI, m_ICmp(FPred, m_Value(), m_Value()))) {
- if (TPred == FPred || TPred == CmpInst::getSwappedPredicate(FPred)) {
- bool Swapped = TPred != FPred;
+ // FIXME: Use CmpPredicate::getMatching here.
+ CmpInst::Predicate T = TPred, F = FPred;
+ if (T == F || T == ICmpInst::getSwappedCmpPredicate(F)) {
+ bool Swapped = T != F;
if (Value *MatchOp =
getCommonOp(TI, FI, ICmpInst::isEquality(TPred), Swapped)) {
Value *NewSel = Builder.CreateSelect(Cond, OtherOpT, OtherOpF,
SI.getName() + ".v", &SI);
return new ICmpInst(
- MatchIsOpZero ? TPred : CmpInst::getSwappedPredicate(TPred),
+ MatchIsOpZero ? TPred : ICmpInst::getSwappedCmpPredicate(TPred),
MatchOp, NewSel);
}
}
@@ -640,7 +642,7 @@ static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
static Value *foldSelectICmpAndZeroShl(const ICmpInst *Cmp, Value *TVal,
Value *FVal,
InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *AndVal;
if (!match(Cmp, m_ICmp(Pred, m_Value(AndVal), m_Zero())))
return nullptr;
@@ -867,7 +869,7 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) {
auto *TrueVal = SI.getTrueValue();
auto *FalseVal = SI.getFalseValue();
Value *X, *Y;
- ICmpInst::Predicate Predicate;
+ CmpPredicate Predicate;
// Assuming that constant compared with zero is not undef (but it may be
// a vector with some undef elements). Otherwise (when a constant is undef)
@@ -1527,7 +1529,7 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
return nullptr;
Value *Cmp1;
- ICmpInst::Predicate Pred1;
+ CmpPredicate Pred1;
Constant *C2;
Value *ReplacementLow, *ReplacementHigh;
if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow),
@@ -1636,7 +1638,7 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
static Instruction *
tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
InstCombinerImpl &IC) {
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *X;
Constant *C0;
if (!match(&Cmp, m_OneUse(m_ICmp(
@@ -1734,7 +1736,7 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI,
InstCombiner::BuilderTy &Builder) {
const APInt *CmpC;
Value *V;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(ICI, m_ICmp(Pred, m_Value(V), m_APInt(CmpC))))
return nullptr;
@@ -1779,6 +1781,46 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI,
return nullptr;
}
+/// Fold `A == MIN_INT ? B != MIN_INT : A < B` --> `A < B` and
+/// `A == MAX_INT ? B != MAX_INT : A > B` --> `A > B`; null if no match.
+static Instruction *foldSelectWithExtremeEqCond(Value *CmpLHS, Value *CmpRHS,
+ Value *TrueVal,
+ Value *FalseVal) {
+ Type *Ty = CmpLHS->getType();
+
+ if (Ty->isPtrOrPtrVectorTy()) // Pointer compares are not handled here.
+ return nullptr;
+
+ CmpPredicate Pred;
+ Value *B;
+
+ if (!match(FalseVal, m_c_ICmp(Pred, m_Specific(CmpLHS), m_Value(B))))
+ return nullptr; // FalseVal must compare CmpLHS (A) against some B.
+
+ Value *TValRHS;
+ if (!match(TrueVal, m_SpecificICmp(ICmpInst::ICMP_NE, m_Specific(B),
+ m_Value(TValRHS))))
+ return nullptr; // TrueVal must be `B != TValRHS`.
+
+ APInt C; // The extreme value both constants are required to equal.
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+
+ if (ICmpInst::isLT(Pred)) { // `<` pairs with the minimum value.
+ C = CmpInst::isSigned(Pred) ? APInt::getSignedMinValue(BitWidth)
+ : APInt::getMinValue(BitWidth);
+ } else if (ICmpInst::isGT(Pred)) { // `>` pairs with the maximum value.
+ C = CmpInst::isSigned(Pred) ? APInt::getSignedMaxValue(BitWidth)
+ : APInt::getMaxValue(BitWidth);
+ } else {
+ return nullptr; // Any other predicate is not part of this pattern.
+ }
+
+ if (!match(CmpRHS, m_SpecificInt(C)) || !match(TValRHS, m_SpecificInt(C)))
+ return nullptr; // Both compared-against constants must be C.
+
+ return new ICmpInst(Pred, CmpLHS, B);
+}
+
static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
InstCombinerImpl &IC) {
ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -1793,6 +1835,10 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
if (Pred == ICmpInst::ICMP_NE)
std::swap(TrueVal, FalseVal);
+ if (Instruction *Res =
+ foldSelectWithExtremeEqCond(CmpLHS, CmpRHS, TrueVal, FalseVal))
+ return Res;
+
// Transform (X == C) ? X : Y -> (X == C) ? C : Y
// specific handling for Bitwise operation.
// x&y -> (x|y) ^ (x^y) or (x|y) & ~(x^y)
@@ -1890,7 +1936,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
BinaryOperator *BOp;
Constant *C1, *C2, *C3;
Value *X;
- ICmpInst::Predicate Predicate;
+ CmpPredicate Predicate;
if (!match(Cmp, m_ICmp(Predicate, m_Value(X), m_Constant(C1))))
return nullptr;
@@ -2138,7 +2184,7 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) {
Type *Ty = Limit->getType();
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *TrueVal, *FalseVal, *Op;
const APInt *C;
if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)),
@@ -2347,7 +2393,7 @@ static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
Value *TVal = Sel.getTrueValue();
Value *FVal = Sel.getFalseValue();
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *A, *B;
if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
return nullptr;
@@ -2552,7 +2598,7 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel,
Value *X;
const APInt *C;
bool IsTrueIfSignSet;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(Cond, m_OneUse(m_ICmp(Pred, m_ElementWiseBitCast(m_Value(X)),
m_APInt(C)))) ||
!isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType)
@@ -2748,7 +2794,7 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC,
Value *TrueVal = SI.getTrueValue();
Value *FalseVal = SI.getFalseValue();
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *Op, *RemRes, *Remainder;
const APInt *C;
bool TrueIfSigned = false;
@@ -2807,7 +2853,7 @@ static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy
// a = select c, x, y ;
// f(a, c) ; f(poison, 1) cannot happen, but if a is folded
// ; to y, this can happen.
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (FI->hasOneUse() &&
match(Cond, m_c_ICmp(Pred, m_Specific(TrueVal), m_Specific(FalseVal))) &&
(Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) {
@@ -2856,7 +2902,7 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI,
for (bool Swap : {false, true}) {
Value *TrueVal = SI.getTrueValue();
Value *X = SI.getFalseValue();
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (Swap)
std::swap(TrueVal, X);
@@ -2936,7 +2982,7 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI,
if (Swap)
std::swap(TrueVal, X);
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
const APInt *C;
bool TrueIfSigned;
if (!match(CondVal,
@@ -2980,7 +3026,7 @@ foldRoundUpIntegerWithPow2Alignment(SelectInst &SI,
Value *X = SI.getTrueValue();
Value *XBiasedHighBits = SI.getFalseValue();
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *XLowBits;
if (!match(Cond, m_ICmp(Pred, m_Value(XLowBits), m_ZeroInt())) ||
!ICmpInst::isEquality(Pred))
@@ -3159,7 +3205,7 @@ static bool impliesPoisonOrCond(const Value *ValAssumedPoison, const Value *V,
Value *LHS = ICmp->getOperand(0);
const APInt *RHSC1;
const APInt *RHSC2;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (ICmp->hasSameSign() &&
match(ICmp->getOperand(1), m_APIntForbidPoison(RHSC1)) &&
match(V, m_ICmp(Pred, m_Specific(LHS), m_APIntAllowPoison(RHSC2)))) {
@@ -3170,7 +3216,7 @@ static bool impliesPoisonOrCond(const Value *ValAssumedPoison, const Value *V,
APInt::getZero(BitWidth))
: ConstantRange(APInt::getZero(BitWidth),
APInt::getSignedMinValue(BitWidth));
- return CRX.icmp(Expected ? Pred : ICmpInst::getInversePredicate(Pred),
+ return CRX.icmp(Expected ? Pred : ICmpInst::getInverseCmpPredicate(Pred),
*RHSC2);
}
}
@@ -3539,7 +3585,7 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder,
Value *FalseVal = SI.getFalseValue();
Value *TrueVal = SI.getTrueValue();
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
const APInt *Cond1;
Value *Cond0, *Ctlz, *CtlzOp;
if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(Cond0), m_APInt(Cond1))))
@@ -3590,7 +3636,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
Value *TV = SI.getTrueValue();
Value *FV = SI.getFalseValue();
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *LHS, *RHS;
if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
return nullptr;
@@ -3610,7 +3656,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
bool IsSigned = ICmpInst::isSigned(Pred);
bool Replace = false;
- ICmpInst::Predicate ExtendedCmpPredicate;
+ CmpPredicate ExtendedCmpPredicate;
// (x < y) ? -1 : zext(x != y)
// (x < y) ? -1 : zext(x > y)
if (ICmpInst::isLT(Pred) && match(TV, m_AllOnes()) &&
@@ -3630,7 +3676,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
Replace = true;
// (x == y) ? 0 : (x > y ? 1 : -1)
- ICmpInst::Predicate FalseBranchSelectPredicate;
+ CmpPredicate FalseBranchSelectPredicate;
const APInt *InnerTV, *InnerFV;
if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero()) &&
match(FV, m_Select(m_c_ICmp(FalseBranchSelectPredicate, m_Specific(LHS),
@@ -3723,22 +3769,9 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
if (!SIFOp || !SIFOp->hasNoSignedZeros() || !SIFOp->hasNoNaNs())
return nullptr;
- // select((fcmp Pred, X, 0), (fadd X, C), C)
- // => fadd((select (fcmp Pred, X, 0), X, 0), C)
- //
- // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE
- Instruction *FAdd;
- Constant *C;
- Value *X, *Z;
- CmpInst::Predicate Pred;
-
- // Note: OneUse check for `Cmp` is necessary because it makes sure that other
- // InstCombine folds don't undo this transformation and cause an infinite
- // loop. Furthermore, it could also increase the operation count.
- if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
- m_OneUse(m_Instruction(FAdd)), m_Constant(C))) ||
- match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
- m_Constant(C), m_OneUse(m_Instruction(FAdd))))) {
+ auto TryFoldIntoAddConstant =
+ [&Builder, &SI](CmpInst::Predicate Pred, Value *X, Value *Z,
+ Instruction *FAdd, Constant *C, bool Swapped) -> Value * {
// Only these relational predicates can be transformed into maxnum/minnum
// intrinsic.
if (!CmpInst::isRelational(Pred) || !match(Z, m_AnyZeroFP()))
@@ -3747,7 +3780,8 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
if (!match(FAdd, m_FAdd(m_Specific(X), m_Specific(C))))
return nullptr;
- Value *NewSelect = Builder.CreateSelect(SI.getCondition(), X, Z, "", &SI);
+ Value *NewSelect = Builder.CreateSelect(SI.getCondition(), Swapped ? Z : X,
+ Swapped ? X : Z, "", &SI);
NewSelect->takeName(&SI);
Value *NewFAdd = Builder.CreateFAdd(NewSelect, C);
@@ -3762,7 +3796,27 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
cast<Instruction>(NewSelect)->setFastMathFlags(NewFMF);
return NewFAdd;
- }
+ };
+
+ // select((fcmp Pred, X, 0), (fadd X, C), C)
+ // => fadd((select (fcmp Pred, X, 0), X, 0), C)
+ //
+ // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE
+ Instruction *FAdd;
+ Constant *C;
+ Value *X, *Z;
+ CmpPredicate Pred;
+
+ // Note: OneUse check for `Cmp` is necessary because it makes sure that other
+ // InstCombine folds don't undo this transformation and cause an infinite
+ // loop. Furthermore, it could also increase the operation count.
+ if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
+ m_OneUse(m_Instruction(FAdd)), m_Constant(C))))
+ return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/false);
+
+ if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
+ m_Constant(C), m_OneUse(m_Instruction(FAdd)))))
+ return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/true);
return nullptr;
}
@@ -3798,6 +3852,12 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
ConstantInt::getFalse(CondType), SQ,
/* AllowRefinement */ true))
return replaceOperand(SI, 2, S);
+
+ if (replaceInInstruction(TrueVal, CondVal,
+ ConstantInt::getTrue(CondType)) ||
+ replaceInInstruction(FalseVal, CondVal,
+ ConstantInt::getFalse(CondType)))
+ return &SI;
}
if (Instruction *R = foldSelectOfBools(SI))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 10c3ccdb2243..d511e79e3e48 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -427,7 +427,8 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- if (Constant *CUI = dyn_cast<Constant>(Op1))
+ Constant *CUI;
+ if (match(Op1, m_ImmConstant(CUI)))
if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
return Res;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 09eafd09451b..ce6154fd610e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -86,7 +86,7 @@ static bool cheapToScalarize(Value *V, Value *EI) {
if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))
return true;
- CmpInst::Predicate UnusedPred;
+ CmpPredicate UnusedPred;
if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))
return true;
@@ -486,7 +486,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
Value *X, *Y;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
cheapToScalarize(SrcVec, Index)) {
// extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
@@ -2978,7 +2978,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
}
}
if (auto *PN = dyn_cast<PHINode>(LHS)) {
- if (Instruction *I = foldOpIntoPhi(SVI, PN))
+ if (Instruction *I = foldOpIntoPhi(SVI, PN, /*AllowMultipleUses=*/true))
return I;
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3325a1868ebd..934156f04f7f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1763,7 +1763,8 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
return nullptr;
}
-Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
+Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
+ bool AllowMultipleUses) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
@@ -1771,7 +1772,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
// We normally only transform phis with a single use. However, if a PHI has
// multiple uses and they are all the same operation, we can fold *all* of the
// uses into the PHI.
- if (!PN->hasOneUse()) {
+ bool OneUse = PN->hasOneUse();
+ bool IdenticalUsers = false;
+ if (!AllowMultipleUses && !OneUse) {
// Walk the use list for the instruction, comparing them to I.
for (User *U : PN->users()) {
Instruction *UI = cast<Instruction>(U);
@@ -1779,6 +1782,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return nullptr;
}
// Otherwise, we can replace *all* users with the new PHI we form.
+ IdenticalUsers = true;
}
// Check that all operands are phi-translatable.
@@ -1829,6 +1833,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
continue;
}
+ if (!OneUse && !IdenticalUsers)
+ return nullptr;
+
if (SeenNonSimplifiedInVal)
return nullptr; // More than one non-simplified value.
SeenNonSimplifiedInVal = true;
@@ -1890,17 +1897,22 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
for (unsigned i = 0; i != NumPHIValues; ++i)
NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
- for (User *U : make_early_inc_range(PN->users())) {
- Instruction *User = cast<Instruction>(U);
- if (User == &I)
- continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
+ if (IdenticalUsers) {
+ for (User *U : make_early_inc_range(PN->users())) {
+ Instruction *User = cast<Instruction>(U);
+ if (User == &I)
+ continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
+ OneUse = true;
}
- replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
- const_cast<PHINode &>(*NewPN),
- const_cast<PHINode &>(*PN), DT);
+ if (OneUse) {
+ replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
+ const_cast<PHINode &>(*NewPN),
+ const_cast<PHINode &>(*PN), DT);
+ }
return replaceInstUsesWith(I, NewPN);
}
@@ -2756,6 +2768,111 @@ static bool shouldCanonicalizeGEPToPtrAdd(GetElementPtrInst &GEP) {
});
}
+static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN,
+ IRBuilderBase &Builder) { // Returns a clone of PN's incoming GEPs, or null.
+ auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (!Op1) // The first incoming value must itself be a GEP instruction.
+ return nullptr;
+
+ // Don't fold a GEP into itself through a PHI node. This can only happen
+ // through the back-edge of a loop. Folding a GEP into itself means that
+ // the value of the previous iteration needs to be stored in the meantime,
+ // thus requiring an additional register variable to be live, but not
+ // actually achieving anything (the GEP still needs to be executed once per
+ // loop iteration).
+ if (Op1 == &GEP)
+ return nullptr;
+
+ int DI = -1; // Operand index where the incoming GEPs differ; -1 if none.
+
+ for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
+ auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands() ||
+ Op1->getSourceElementType() != Op2->getSourceElementType())
+ return nullptr;
+
+ // As for Op1 above, don't try to fold a GEP into itself.
+ if (Op2 == &GEP)
+ return nullptr;
+
+ // Keep track of the type as we walk the GEP.
+ Type *CurTy = nullptr;
+
+ for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+ if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
+ return nullptr;
+
+ if (Op1->getOperand(J) != Op2->getOperand(J)) {
+ if (DI == -1) {
+ // We have not yet seen any difference in the GEPs feeding the
+ // PHI, so we record this one if it is allowed to be a
+ // variable.
+
+ // The first two arguments can vary for any GEP; the rest have to be
+ // static for struct slots.
+ if (J > 1) {
+ assert(CurTy && "No current type?");
+ if (CurTy->isStructTy())
+ return nullptr;
+ }
+
+ DI = J;
+ } else {
+ // The GEP is different by more than one input. While this could be
+ // extended to support GEPs that vary by more than one variable, it
+ // doesn't make sense, since it greatly increases the complexity and
+ // would result in an R+R+R addressing mode which no backend
+ // directly supports and would need to be broken into several
+ // simpler instructions anyway.
+ return nullptr;
+ }
+ }
+
+ // Sink down a layer of the type for the next iteration.
+ if (J > 0) {
+ if (J == 1) {
+ CurTy = Op1->getSourceElementType();
+ } else {
+ CurTy =
+ GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
+ }
+ }
+ }
+ }
+
+ // If not all GEPs are identical we'll have to create a new PHI node.
+ // Check that the old PHI node has only one use so that it will get
+ // removed.
+ if (DI != -1 && !PN->hasOneUse())
+ return nullptr;
+
+ auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ if (DI == -1) {
+ // All the GEPs feeding the PHI are identical. Clone one down into our
+ // BB so that it can be merged with the current GEP.
+ } else {
+ // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+ // into the current block so it can be merged, and create a new PHI to
+ // set that index.
+ PHINode *NewPN;
+ {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PN); // Create the new PHI at PN's position.
+ NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
+ }
+
+ for (auto &I : PN->operands()) // One incoming value per edge of PN.
+ NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
+ PN->getIncomingBlock(I));
+
+ NewGEP->setOperand(DI, NewPN); // The clone reads the varying operand.
+ }
+
+ NewGEP->insertBefore(*GEP.getParent(), GEP.getParent()->getFirstInsertionPt());
+ return NewGEP;
+}
+
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Value *PtrOp = GEP.getOperand(0);
SmallVector<Value *, 8> Indices(GEP.indices());
@@ -2846,107 +2963,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Check to see if the inputs to the PHI node are getelementptr instructions.
if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
- auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
- if (!Op1)
- return nullptr;
-
- // Don't fold a GEP into itself through a PHI node. This can only happen
- // through the back-edge of a loop. Folding a GEP into itself means that
- // the value of the previous iteration needs to be stored in the meantime,
- // thus requiring an additional register variable to be live, but not
- // actually achieving anything (the GEP still needs to be executed once per
- // loop iteration).
- if (Op1 == &GEP)
- return nullptr;
-
- int DI = -1;
-
- for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
- auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
- if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands() ||
- Op1->getSourceElementType() != Op2->getSourceElementType())
- return nullptr;
-
- // As for Op1 above, don't try to fold a GEP into itself.
- if (Op2 == &GEP)
- return nullptr;
-
- // Keep track of the type as we walk the GEP.
- Type *CurTy = nullptr;
-
- for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
- if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
- return nullptr;
-
- if (Op1->getOperand(J) != Op2->getOperand(J)) {
- if (DI == -1) {
- // We have not seen any differences yet in the GEPs feeding the
- // PHI yet, so we record this one if it is allowed to be a
- // variable.
-
- // The first two arguments can vary for any GEP, the rest have to be
- // static for struct slots
- if (J > 1) {
- assert(CurTy && "No current type?");
- if (CurTy->isStructTy())
- return nullptr;
- }
-
- DI = J;
- } else {
- // The GEP is different by more than one input. While this could be
- // extended to support GEPs that vary by more than one variable it
- // doesn't make sense since it greatly increases the complexity and
- // would result in an R+R+R addressing mode which no backend
- // directly supports and would need to be broken into several
- // simpler instructions anyway.
- return nullptr;
- }
- }
-
- // Sink down a layer of the type for the next iteration.
- if (J > 0) {
- if (J == 1) {
- CurTy = Op1->getSourceElementType();
- } else {
- CurTy =
- GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
- }
- }
- }
- }
-
- // If not all GEPs are identical we'll have to create a new PHI node.
- // Check that the old PHI node has only one use so that it will get
- // removed.
- if (DI != -1 && !PN->hasOneUse())
- return nullptr;
-
- auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
- if (DI == -1) {
- // All the GEPs feeding the PHI are identical. Clone one down into our
- // BB so that it can be merged with the current GEP.
- } else {
- // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
- // into the current block so it can be merged, and create a new PHI to
- // set that index.
- PHINode *NewPN;
- {
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(PN);
- NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
- PN->getNumOperands());
- }
-
- for (auto &I : PN->operands())
- NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
- PN->getIncomingBlock(I));
-
- NewGEP->setOperand(DI, NewPN);
- }
-
- NewGEP->insertBefore(*GEP.getParent(), GEP.getParent()->getFirstInsertionPt());
- return replaceOperand(GEP, 0, NewGEP);
+ if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder))
+ return replaceOperand(GEP, 0, NewPtrOp);
}
if (auto *Src = dyn_cast<GEPOperator>(PtrOp))
@@ -3113,6 +3131,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
+ // nusw + nneg -> nuw
+ if (GEP.hasNoUnsignedSignedWrap() && !GEP.hasNoUnsignedWrap() &&
+ all_of(GEP.indices(), [&](Value *Idx) {
+ return isKnownNonNegative(Idx, SQ.getWithInstruction(&GEP));
+ })) {
+ GEP.setNoWrapFlags(GEP.getNoWrapFlags() | GEPNoWrapFlags::noUnsignedWrap());
+ return &GEP;
+ }
+
if (Instruction *R = foldSelectGEP(GEP, Builder))
return R;
@@ -3443,7 +3470,7 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
// Validate the rest of constraint #1 by matching on the pred branch.
Instruction *TI = PredBB->getTerminator();
BasicBlock *TrueBB, *FalseBB;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(TI, m_Br(m_ICmp(Pred,
m_CombineOr(m_Specific(Op),
m_Specific(Op->stripPointerCasts())),
@@ -3724,7 +3751,7 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) {
return replaceOperand(BI, 0, ConstantInt::getFalse(Cond->getType()));
// Canonicalize, for example, fcmp_one -> fcmp_oeq.
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(Cond, m_OneUse(m_FCmp(Pred, m_Value(), m_Value()))) &&
!isCanonicalPredicate(Pred)) {
// Swap destinations and condition.
@@ -3785,7 +3812,7 @@ static Value *simplifySwitchOnSelectUsingRanges(SwitchInst &SI,
if (CstBB != SI.getDefaultDest())
return nullptr;
Value *X = Select->getOperand(3 - CstOpIdx);
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
const APInt *RHSC;
if (!match(Select->getCondition(),
m_ICmp(Pred, m_Specific(X), m_APInt(RHSC))))
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index b398a13383b9..41e503858124 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -8,6 +8,7 @@
#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -36,15 +37,16 @@ using namespace llvm;
static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
cl::desc("Use one trap block per function"));
-static cl::opt<bool> DebugTrapBB("bounds-checking-unique-traps",
- cl::desc("Always use one trap per check"));
-
STATISTIC(ChecksAdded, "Bounds checks added");
STATISTIC(ChecksSkipped, "Bounds checks skipped");
STATISTIC(ChecksUnable, "Bounds checks unable to add");
using BuilderTy = IRBuilder<TargetFolder>;
+BoundsCheckingPass::BoundsCheckingOptions::BoundsCheckingOptions(
+ ReportingMode Mode, bool Merge)
+ : Mode(Mode), Merge(Merge) {}
+
/// Gets the conditions under which memory accessing instructions will overflow.
///
/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
@@ -104,6 +106,30 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
return Or;
}
+static CallInst *InsertTrap(BuilderTy &IRB, bool DebugTrapBB) {
+ if (!DebugTrapBB)
+ return IRB.CreateIntrinsic(Intrinsic::trap, {}, {});
+ // FIXME: Ideally we would use the SanitizerHandler::OutOfBounds constant.
+ return IRB.CreateIntrinsic(
+ Intrinsic::ubsantrap, {},
+ ConstantInt::get(IRB.getInt8Ty(),
+ IRB.GetInsertBlock()->getParent()->size()));
+}
+
+static CallInst *InsertCall(BuilderTy &IRB, bool MayReturn, StringRef Name) {
+ Function *Fn = IRB.GetInsertBlock()->getParent();
+ LLVMContext &Ctx = Fn->getContext();
+ llvm::AttrBuilder B(Ctx);
+ B.addAttribute(llvm::Attribute::NoUnwind);
+ if (!MayReturn)
+ B.addAttribute(llvm::Attribute::NoReturn);
+ FunctionCallee Callee = Fn->getParent()->getOrInsertFunction(
+ Name,
+ llvm::AttributeList::get(Ctx, llvm::AttributeList::FunctionIndex, B),
+ Type::getVoidTy(Ctx));
+ return IRB.CreateCall(Callee);
+}
+
/// Adds run-time bounds checks to memory accessing instructions.
///
/// \p Or is the condition that should guard the trap.
@@ -126,20 +152,56 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
OldBB->getTerminator()->eraseFromParent();
+ BasicBlock *TrapBB = GetTrapBB(IRB, Cont);
+
if (C) {
// If we have a constant zero, unconditionally branch.
// FIXME: We should really handle this differently to bypass the splitting
// the block.
- BranchInst::Create(GetTrapBB(IRB), OldBB);
+ BranchInst::Create(TrapBB, OldBB);
return;
}
// Create the conditional branch.
- BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
+ BranchInst::Create(TrapBB, Cont, Or, OldBB);
}
+struct ReportingOpts {
+ bool MayReturn = false;
+ bool UseTrap = false;
+ bool MinRuntime = false;
+ bool MayMerge = true;
+ StringRef Name;
+
+ ReportingOpts(BoundsCheckingPass::ReportingMode Mode, bool Merge) {
+ switch (Mode) {
+ case BoundsCheckingPass::ReportingMode::Trap:
+ UseTrap = true;
+ break;
+ case BoundsCheckingPass::ReportingMode::MinRuntime:
+ Name = "__ubsan_handle_local_out_of_bounds_minimal";
+ MinRuntime = true;
+ MayReturn = true;
+ break;
+ case BoundsCheckingPass::ReportingMode::MinRuntimeAbort:
+ Name = "__ubsan_handle_local_out_of_bounds_minimal_abort";
+ MinRuntime = true;
+ break;
+ case BoundsCheckingPass::ReportingMode::FullRuntime:
+ Name = "__ubsan_handle_local_out_of_bounds";
+ MayReturn = true;
+ break;
+ case BoundsCheckingPass::ReportingMode::FullRuntimeAbort:
+ Name = "__ubsan_handle_local_out_of_bounds_abort";
+ break;
+ }
+
+ MayMerge = Merge;
+ }
+};
+
static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE, const ReportingOpts &Opts) {
if (F.hasFnAttribute(Attribute::NoSanitizeBounds))
return false;
@@ -180,37 +242,43 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
// Create a trapping basic block on demand using a callback. Depending on
// flags, this will either create a single block for the entire function or
// will create a fresh block every time it is called.
- BasicBlock *TrapBB = nullptr;
- auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
+ BasicBlock *ReuseTrapBB = nullptr;
+ auto GetTrapBB = [&ReuseTrapBB, &Opts](BuilderTy &IRB, BasicBlock *Cont) {
Function *Fn = IRB.GetInsertBlock()->getParent();
auto DebugLoc = IRB.getCurrentDebugLocation();
IRBuilder<>::InsertPointGuard Guard(IRB);
- if (TrapBB && SingleTrapBB && !DebugTrapBB)
- return TrapBB;
+ // Create a trapping basic block on demand using a callback. Depending on
+ // flags, this will either create a single block for the entire function or
+ // will create a fresh block every time it is called.
+ if (ReuseTrapBB)
+ return ReuseTrapBB;
- TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+ BasicBlock *TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
IRB.SetInsertPoint(TrapBB);
- Intrinsic::ID IntrID = DebugTrapBB ? Intrinsic::ubsantrap : Intrinsic::trap;
+ bool DebugTrapBB = !Opts.MayMerge;
+ CallInst *TrapCall = Opts.UseTrap
+ ? InsertTrap(IRB, DebugTrapBB)
+ : InsertCall(IRB, Opts.MayReturn, Opts.Name);
+ if (DebugTrapBB)
+ TrapCall->addFnAttr(llvm::Attribute::NoMerge);
- CallInst *TrapCall;
- if (DebugTrapBB) {
- TrapCall = IRB.CreateIntrinsic(
- IntrID, {}, ConstantInt::get(IRB.getInt8Ty(), Fn->size()));
+ TrapCall->setDoesNotThrow();
+ TrapCall->setDebugLoc(DebugLoc);
+ if (Opts.MayReturn) {
+ IRB.CreateBr(Cont);
} else {
- TrapCall = IRB.CreateIntrinsic(IntrID, {}, {});
+ TrapCall->setDoesNotReturn();
+ IRB.CreateUnreachable();
}
- TrapCall->setDoesNotReturn();
- TrapCall->setDoesNotThrow();
- TrapCall->setDebugLoc(DebugLoc);
- IRB.CreateUnreachable();
+ if (!Opts.MayReturn && SingleTrapBB && !DebugTrapBB)
+ ReuseTrapBB = TrapBB;
return TrapBB;
};
- // Add the checks.
for (const auto &Entry : TrapInfo) {
Instruction *Inst = Entry.first;
BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
@@ -224,8 +292,35 @@ PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- if (!addBoundsChecking(F, TLI, SE))
+ if (!addBoundsChecking(F, TLI, SE,
+ ReportingOpts(Options.Mode, Options.Merge)))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
+
+void BoundsCheckingPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<BoundsCheckingPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ switch (Options.Mode) {
+ case ReportingMode::Trap:
+ OS << "<trap";
+ break;
+ case ReportingMode::MinRuntime:
+ OS << "<min-rt";
+ break;
+ case ReportingMode::MinRuntimeAbort:
+ OS << "<min-rt-abort";
+ break;
+ case ReportingMode::FullRuntime:
+ OS << "<rt";
+ break;
+ case ReportingMode::FullRuntimeAbort:
+ OS << "<rt-abort";
+ break;
+ }
+ if (Options.Merge)
+ OS << ";merge";
+ OS << ">";
+}
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 3e3c3eced4bb..5c437437fe36 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -19,11 +19,11 @@ add_llvm_component_library(LLVMInstrumentation
PGOForceFunctionAttrs.cpp
PGOInstrumentation.cpp
PGOMemOPSizeOpt.cpp
- PoisonChecking.cpp
SanitizerCoverage.cpp
SanitizerBinaryMetadata.cpp
ValueProfileCollector.cpp
ThreadSanitizer.cpp
+ TypeSanitizer.cpp
HWAddressSanitizer.cpp
RealtimeSanitizer.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index f9be7f933d31..6e86ffdc8027 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -61,7 +61,7 @@ enum : uint32_t {
};
static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
- cl::init("408*"), cl::Hidden,
+ cl::init("0000"), cl::Hidden,
cl::ValueRequired);
static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden,
@@ -154,6 +154,7 @@ private:
GCOVOptions Options;
llvm::endianness Endian;
raw_ostream *os;
+ int Version = 0;
// Checksum, produced by hash of EdgeDestinations
SmallVector<uint32_t, 4> FileChecksums;
@@ -334,12 +335,9 @@ namespace {
: GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) {
LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
- bool ExitBlockBeforeBody = Version >= 48;
- uint32_t i = ExitBlockBeforeBody ? 2 : 1;
+ uint32_t i = 2;
for (BasicBlock &BB : *F)
Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
- if (!ExitBlockBeforeBody)
- ReturnBlock.Number = i;
std::string FunctionNameAndLine;
raw_string_ostream FNLOS(FunctionNameAndLine);
@@ -363,44 +361,28 @@ namespace {
void writeOut(uint32_t CfgChecksum) {
write(GCOV_TAG_FUNCTION);
SmallString<128> Filename = getFilename(SP);
- uint32_t BlockLen =
- 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
- if (Version < 80)
- BlockLen += wordsOfString(Filename) + 1;
- else
- BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
+ uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
+ BlockLen += 1 + wordsOfString(Filename) + 4;
write(BlockLen);
write(Ident);
write(FuncChecksum);
- if (Version >= 47)
- write(CfgChecksum);
+ write(CfgChecksum);
writeString(getFunctionName(SP));
- if (Version < 80) {
- writeString(Filename);
- write(SP->getLine());
- } else {
- write(SP->isArtificial()); // artificial
- writeString(Filename);
- write(SP->getLine()); // start_line
- write(0); // start_column
- // EndLine is the last line with !dbg. It is not the } line as in GCC,
- // but good enough.
- write(EndLine);
- if (Version >= 90)
- write(0); // end_column
- }
+
+ write(SP->isArtificial()); // artificial
+ writeString(Filename);
+ write(SP->getLine()); // start_line
+ write(0); // start_column
+ // EndLine is the last line with !dbg. It is not the } line as in GCC,
+ // but good enough.
+ write(EndLine);
+ write(0); // end_column
// Emit count of blocks.
write(GCOV_TAG_BLOCKS);
- if (Version < 80) {
- write(Blocks.size() + 2);
- for (int i = Blocks.size() + 2; i; --i)
- write(0);
- } else {
- write(1);
- write(Blocks.size() + 2);
- }
+ write(1);
+ write(Blocks.size() + 2);
LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
// Emit edges between blocks.
@@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes(
function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) {
- int Version;
{
uint8_t c3 = Options.Version[0];
uint8_t c2 = Options.Version[1];
@@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes(
Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
: (c3 - '0') * 10 + c1 - '0';
}
+ // Emit .gcno files that are compatible with GCC 11.1.
+ if (Version < 111) {
+ Version = 111;
+ memcpy(Options.Version, "B11*", 4);
+ }
bool EmitGCDA = Options.EmitData;
for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
@@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes(
out.write(Tmp, 4);
}
write(Stamp);
- if (Version >= 90)
- writeString(""); // unuseful current_working_directory
- if (Version >= 80)
- write(0); // unuseful has_unexecuted_blocks
+ writeString("."); // unuseful current_working_directory
+ write(0); // unuseful has_unexecuted_blocks
for (auto &Func : Funcs)
Func->writeOut(Stamp);
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 33a7a37fa28e..f1580b025efc 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -152,7 +152,7 @@ static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"),
// override these hints anyway.
static cl::opt<bool> ClMemProfMatchHotColdNew(
"memprof-match-hot-cold-new",
- cl::desc(
+ cl::desc(
"Match allocation profiles onto existing hot/cold operator new calls"),
cl::Hidden, cl::init(false));
@@ -166,8 +166,26 @@ static cl::opt<bool>
"context in this module's profiles"),
cl::Hidden, cl::init(false));
+static cl::opt<std::string>
+ MemprofRuntimeDefaultOptions("memprof-runtime-default-options",
+ cl::desc("The default memprof options"),
+ cl::Hidden, cl::init(""));
+
+static cl::opt<bool>
+ SalvageStaleProfile("memprof-salvage-stale-profile",
+ cl::desc("Salvage stale MemProf profile"),
+ cl::init(false), cl::Hidden);
+
+cl::opt<unsigned> MinClonedColdBytePercent(
+ "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
+ cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
+
extern cl::opt<bool> MemProfReportHintedSizes;
+static cl::opt<unsigned> MinMatchedColdBytePercent(
+ "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
+ cl::desc("Min percent of cold bytes matched to hint allocation cold"));
+
// Instrumentation statistics
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
@@ -547,6 +565,20 @@ void createMemprofHistogramFlagVar(Module &M) {
appendToCompilerUsed(M, MemprofHistogramFlag);
}
+void createMemprofDefaultOptionsVar(Module &M) {
+ Constant *OptionsConst = ConstantDataArray::getString(
+ M.getContext(), MemprofRuntimeDefaultOptions, /*AddNull=*/true);
+ GlobalVariable *OptionsVar =
+ new GlobalVariable(M, OptionsConst->getType(), /*isConstant=*/true,
+ GlobalValue::WeakAnyLinkage, OptionsConst,
+ "__memprof_default_options_str");
+ Triple TT(M.getTargetTriple());
+ if (TT.supportsCOMDAT()) {
+ OptionsVar->setLinkage(GlobalValue::ExternalLinkage);
+ OptionsVar->setComdat(M.getOrInsertComdat(OptionsVar->getName()));
+ }
+}
+
bool ModuleMemProfiler::instrumentModule(Module &M) {
// Create a module constructor.
@@ -566,6 +598,8 @@ bool ModuleMemProfiler::instrumentModule(Module &M) {
createMemprofHistogramFlagVar(M);
+ createMemprofDefaultOptionsVar(M);
+
return true;
}
@@ -704,8 +738,7 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
// Helper to generate a single hash id for a given callstack, used for emitting
// matching statistics and useful for uniquing such statistics across modules.
-static uint64_t
-computeFullStackId(const std::vector<memprof::Frame> &CallStack) {
+static uint64_t computeFullStackId(ArrayRef<Frame> CallStack) {
llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
HashBuilder;
for (auto &F : CallStack)
@@ -726,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
AllocInfo->Info.getAllocCount(),
AllocInfo->Info.getTotalLifetime());
std::vector<ContextTotalSize> ContextSizeInfo;
- if (MemProfReportHintedSizes) {
+ if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
auto TotalSize = AllocInfo->Info.getTotalSize();
assert(TotalSize);
assert(FullStackId != 0);
@@ -742,9 +775,8 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
// non-zero.
static bool
stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
- ArrayRef<uint64_t> InlinedCallStack,
- unsigned StartIndex = 0) {
- auto StackFrame = ProfileCallStack.begin() + StartIndex;
+ ArrayRef<uint64_t> InlinedCallStack) {
+ auto StackFrame = ProfileCallStack.begin();
auto InlCallStackIter = InlinedCallStack.begin();
for (; StackFrame != ProfileCallStack.end() &&
InlCallStackIter != InlinedCallStack.end();
@@ -800,7 +832,8 @@ struct AllocMatchInfo {
};
DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
-memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) {
+memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI,
+ function_ref<bool(uint64_t)> IsPresentInProfile) {
DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls;
auto GetOffset = [](const DILocation *DIL) {
@@ -824,7 +857,12 @@ memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) {
continue;
StringRef CalleeName = CalledFunction->getName();
+ // True if we are calling a heap allocation function that supports
+ // hot/cold variants.
bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
+ // True for the first iteration below, indicating that we are looking at
+ // a leaf node.
+ bool IsLeaf = true;
for (const DILocation *DIL = I.getDebugLoc(); DIL;
DIL = DIL->getInlinedAt()) {
StringRef CallerName = DIL->getSubprogramLinkageName();
@@ -833,16 +871,27 @@ memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) {
uint64_t CallerGUID = IndexedMemProfRecord::getGUID(CallerName);
uint64_t CalleeGUID = IndexedMemProfRecord::getGUID(CalleeName);
// Pretend that we are calling a function with GUID == 0 if we are
- // calling a heap allocation function.
- if (IsAlloc)
- CalleeGUID = 0;
+ // in the inline stack leading to a heap allocation function.
+ if (IsAlloc) {
+ if (IsLeaf) {
+ // For leaf nodes, set CalleeGUID to 0 without consulting
+ // IsPresentInProfile.
+ CalleeGUID = 0;
+ } else if (!IsPresentInProfile(CalleeGUID)) {
+ // In addition to the leaf case above, continue to set CalleeGUID
+ // to 0 as long as we don't see CalleeGUID in the profile.
+ CalleeGUID = 0;
+ } else {
+ // Once we encounter a callee that exists in the profile, stop
+ // setting CalleeGUID to 0.
+ IsAlloc = false;
+ }
+ }
+
LineLocation Loc = {GetOffset(DIL), DIL->getColumn()};
Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
CalleeName = CallerName;
- // FIXME: Recognize other frames that are associated with heap
- // allocation functions. It may be too early to reset IsAlloc to
- // false here.
- IsAlloc = false;
+ IsLeaf = false;
}
}
}
@@ -865,7 +914,9 @@ memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
MemProfReader->getMemProfCallerCalleePairs();
DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
- extractCallsFromIR(M, TLI);
+ extractCallsFromIR(M, TLI, [&](uint64_t GUID) {
+ return CallsFromProfile.contains(GUID);
+ });
// Compute an undrift map for each CallerGUID.
for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
@@ -888,10 +939,38 @@ memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
return UndriftMaps;
}
+// Given a MemProfRecord, undrift all the source locations present in the
+// record in place.
+static void
+undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
+ memprof::MemProfRecord &MemProfRec) {
+ // Undrift a call stack in place.
+ auto UndriftCallStack = [&](std::vector<Frame> &CallStack) {
+ for (auto &F : CallStack) {
+ auto I = UndriftMaps.find(F.Function);
+ if (I == UndriftMaps.end())
+ continue;
+ auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
+ if (J == I->second.end())
+ continue;
+ auto &NewLoc = J->second;
+ F.LineOffset = NewLoc.LineOffset;
+ F.Column = NewLoc.Column;
+ }
+ };
+
+ for (auto &AS : MemProfRec.AllocSites)
+ UndriftCallStack(AS.CallStack);
+
+ for (auto &CS : MemProfRec.CallSites)
+ UndriftCallStack(CS);
+}
+
static void
readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
const TargetLibraryInfo &TLI,
- std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo) {
+ std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
+ DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
auto &Ctx = M.getContext();
// Previously we used getIRPGOFuncName() here. If F is local linkage,
// getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
@@ -939,6 +1018,11 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
NumOfMemProfFunc++;
+ // If requested, undrift MemProfRecord so that the source locations in it
+ // match those in the IR.
+ if (SalvageStaleProfile)
+ undriftMemProfRecord(UndriftMaps, *MemProfRec);
+
// Detect if there are non-zero column numbers in the profile. If not,
// treat all column numbers as 0 when matching (i.e. ignore any non-zero
// columns in the IR). The profiled binary might have been built with
@@ -948,9 +1032,15 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// Build maps of the location hash to all profile data with that leaf location
// (allocation info and the callsites).
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
- // For the callsites we need to record the index of the associated frame in
- // the frame array (see comments below where the map entries are added).
- std::map<uint64_t, std::set<std::pair<const std::vector<Frame> *, unsigned>>>
+ // A hash function for std::unordered_set<ArrayRef<Frame>> to work.
+ struct CallStackHash {
+ size_t operator()(ArrayRef<Frame> CS) const {
+ return computeFullStackId(CS);
+ }
+ };
+ // For the callsites we need to record slices of the frame array (see comments
+ // below where the map entries are added).
+ std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>>
LocHashToCallSites;
for (auto &AI : MemProfRec->AllocSites) {
NumOfMemProfAllocContextProfiles++;
@@ -968,7 +1058,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
unsigned Idx = 0;
for (auto &StackFrame : CS) {
uint64_t StackId = computeStackId(StackFrame);
- LocHashToCallSites[StackId].insert(std::make_pair(&CS, Idx++));
+ LocHashToCallSites[StackId].insert(ArrayRef<Frame>(CS).drop_front(Idx++));
ProfileHasColumns |= StackFrame.Column;
// Once we find this function, we can stop recording.
if (StackFrame.Function == FuncGUID)
@@ -1008,8 +1098,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// and another callsite).
std::map<uint64_t, std::set<const AllocationInfo *>>::iterator
AllocInfoIter;
- std::map<uint64_t, std::set<std::pair<const std::vector<Frame> *,
- unsigned>>>::iterator CallSitesIter;
+ decltype(LocHashToCallSites)::iterator CallSitesIter;
for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr;
DIL = DIL->getInlinedAt()) {
// Use C++ linkage name if possible. Need to compile with
@@ -1050,6 +1139,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// contexts. Add them to a Trie specialized for trimming the contexts to
// the minimal needed to disambiguate contexts with unique behavior.
CallStackTrie AllocTrie;
+ uint64_t TotalSize = 0;
+ uint64_t TotalColdSize = 0;
for (auto *AllocInfo : AllocInfoIter->second) {
// Check the full inlined call stack against this one.
// If we found and thus matched all frames on the call, include
@@ -1058,9 +1149,13 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+ if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
+ MinClonedColdBytePercent < 100)
FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
+ TotalSize += AllocInfo->Info.getTotalSize();
+ if (AllocType == AllocationType::Cold)
+ TotalColdSize += AllocInfo->Info.getTotalSize();
// Record information about the allocation if match info printing
// was requested.
if (ClPrintMemProfMatchInfo) {
@@ -1070,6 +1165,16 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
}
}
}
+ // If the threshold for the percent of cold bytes is less than 100%,
+ // and not all bytes are cold, see if we should still hint this
+ // allocation as cold without context sensitivity.
+ if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
+ TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
+ AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
+ "dominant");
+ continue;
+ }
+
// We might not have matched any to the full inlined call stack.
// But if we did, create and attach metadata, or a function attribute if
// all contexts have identical profiled behavior.
@@ -1100,8 +1205,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
for (auto CallStackIdx : CallSitesIter->second) {
// If we found and thus matched all frames on the call, create and
// attach call stack metadata.
- if (stackFrameIncludesInlinedCallStack(
- *CallStackIdx.first, InlinedCallStack, CallStackIdx.second)) {
+ if (stackFrameIncludesInlinedCallStack(CallStackIdx,
+ InlinedCallStack)) {
NumOfMemProfMatchedCallSites++;
addCallsiteMetadata(I, InlinedCallStack, Ctx);
// Only need to find one with a matching call stack and add a single
@@ -1152,6 +1257,11 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
+ DenseMap<uint64_t, LocToLocMap> UndriftMaps;
+ if (SalvageStaleProfile)
+ UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);
+
// Map from the stack hash of each allocation context in the function profiles
// to the total profiled size (bytes), allocation type, and whether we matched
// it to an allocation in the IR.
@@ -1162,7 +1272,8 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
continue;
const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo);
+ readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
+ UndriftMaps);
}
if (ClPrintMemProfMatchInfo) {
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 5c419c6374bd..429e323b6b7c 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4026,6 +4026,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Approximation only
+ void handleNEONVectorMultiplyIntrinsic(IntrinsicInst &I) {
+ handleShadowOr(I);
+ }
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::uadd_with_overflow:
@@ -4341,6 +4346,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handlePclmulIntrinsic(I);
break;
+ case Intrinsic::x86_avx_round_pd_256:
+ case Intrinsic::x86_avx_round_ps_256:
case Intrinsic::x86_sse41_round_pd:
case Intrinsic::x86_sse41_round_ps:
handleRoundPdPsIntrinsic(I);
@@ -4429,6 +4436,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ case Intrinsic::aarch64_neon_fmulx:
+ case Intrinsic::aarch64_neon_pmul:
+ case Intrinsic::aarch64_neon_pmull:
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_pmull64:
+ case Intrinsic::aarch64_neon_umull: {
+ handleNEONVectorMultiplyIntrinsic(I);
+ break;
+ }
+
default:
if (!handleUnknownIntrinsic(I))
visitInstruction(I);
diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
deleted file mode 100644
index e094acdc3178..000000000000
--- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
+++ /dev/null
@@ -1,358 +0,0 @@
-//===- PoisonChecking.cpp - -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements a transform pass which instruments IR such that poison semantics
-// are made explicit. That is, it provides a (possibly partial) executable
-// semantics for every instruction w.r.t. poison as specified in the LLVM
-// LangRef. There are obvious parallels to the sanitizer tools, but this pass
-// is focused purely on the semantics of LLVM IR, not any particular source
-// language. If you're looking for something to see if your C/C++ contains
-// UB, this is not it.
-//
-// The rewritten semantics of each instruction will include the following
-// components:
-//
-// 1) The original instruction, unmodified.
-// 2) A propagation rule which translates dynamic information about the poison
-// state of each input to whether the dynamic output of the instruction
-// produces poison.
-// 3) A creation rule which validates any poison producing flags on the
-// instruction itself (e.g. checks for overflow on nsw).
-// 4) A check rule which traps (to a handler function) if this instruction must
-// execute undefined behavior given the poison state of it's inputs.
-//
-// This is a must analysis based transform; that is, the resulting code may
-// produce a false negative result (not report UB when actually exists
-// according to the LangRef spec), but should never produce a false positive
-// (report UB where it doesn't exist).
-//
-// Use cases for this pass include:
-// - Understanding (and testing!) the implications of the definition of poison
-// from the LangRef.
-// - Validating the output of a IR fuzzer to ensure that all programs produced
-// are well defined on the specific input used.
-// - Finding/confirming poison specific miscompiles by checking the poison
-// status of an input/IR pair is the same before and after an optimization
-// transform.
-// - Checking that a bugpoint reduction does not introduce UB which didn't
-// exist in the original program being reduced.
-//
-// The major sources of inaccuracy are currently:
-// - Most validation rules not yet implemented for instructions with poison
-// relavant flags. At the moment, only nsw/nuw on add/sub are supported.
-// - UB which is control dependent on a branch on poison is not yet
-// reported. Currently, only data flow dependence is modeled.
-// - Poison which is propagated through memory is not modeled. As such,
-// storing poison to memory and then reloading it will cause a false negative
-// as we consider the reloaded value to not be poisoned.
-// - Poison propagation across function boundaries is not modeled. At the
-// moment, all arguments and return values are assumed not to be poison.
-// - Undef is not modeled. In particular, the optimizer's freedom to pick
-// concrete values for undef bits so as to maximize potential for producing
-// poison is not modeled.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "poison-checking"
-
-static cl::opt<bool>
-LocalCheck("poison-checking-function-local",
- cl::init(false),
- cl::desc("Check that returns are non-poison (for testing)"));
-
-
-static bool isConstantFalse(Value* V) {
- assert(V->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(V))
- return CI->isZero();
- return false;
-}
-
-static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
- if (Ops.size() == 0)
- return B.getFalse();
- unsigned i = 0;
- for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {}
- if (i == Ops.size())
- return B.getFalse();
- Value *Accum = Ops[i++];
- for (Value *Op : llvm::drop_begin(Ops, i))
- if (!isConstantFalse(Op))
- Accum = B.CreateOr(Accum, Op);
- return Accum;
-}
-
-static void generateCreationChecksForBinOp(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- assert(isa<BinaryOperator>(I));
-
- IRBuilder<> B(&I);
- Value *LHS = I.getOperand(0);
- Value *RHS = I.getOperand(1);
- switch (I.getOpcode()) {
- default:
- return;
- case Instruction::Add: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Sub: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Mul: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::UDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::SDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::Shl: {
- Value *ShiftCheck =
- B.CreateICmp(ICmpInst::ICMP_UGE, RHS,
- ConstantInt::get(RHS->getType(),
- LHS->getType()->getScalarSizeInBits()));
- Checks.push_back(ShiftCheck);
- break;
- }
- };
-}
-
-/// Given an instruction which can produce poison on non-poison inputs
-/// (i.e. canCreatePoison returns true), generate runtime checks to produce
-/// boolean indicators of when poison would result.
-static void generateCreationChecks(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- IRBuilder<> B(&I);
- if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
- generateCreationChecksForBinOp(I, Checks);
-
- // Handle non-binops separately
- switch (I.getOpcode()) {
- default:
- // Note there are a couple of missing cases here, once implemented, this
- // should become an llvm_unreachable.
- break;
- case Instruction::ExtractElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(1);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- case Instruction::InsertElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(2);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- };
-}
-
-static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
- auto Itr = ValToPoison.find(V);
- if (Itr != ValToPoison.end())
- return Itr->second;
- if (isa<Constant>(V)) {
- return ConstantInt::getFalse(V->getContext());
- }
- // Return false for unknwon values - this implements a non-strict mode where
- // unhandled IR constructs are simply considered to never produce poison. At
- // some point in the future, we probably want a "strict mode" for testing if
- // nothing else.
- return ConstantInt::getFalse(V->getContext());
-}
-
-static void CreateAssert(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(Cond))
- if (CI->isAllOnesValue())
- return;
-
- Module *M = B.GetInsertBlock()->getModule();
- M->getOrInsertFunction("__poison_checker_assert",
- Type::getVoidTy(M->getContext()),
- Type::getInt1Ty(M->getContext()));
- Function *TrapFunc = M->getFunction("__poison_checker_assert");
- B.CreateCall(TrapFunc, Cond);
-}
-
-static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- CreateAssert(B, B.CreateNot(Cond));
-}
-
-static bool rewrite(Function &F) {
- auto * const Int1Ty = Type::getInt1Ty(F.getContext());
-
- DenseMap<Value *, Value *> ValToPoison;
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
- NewPHI->addIncoming(UndefValue::get(Int1Ty),
- OldPHI->getIncomingBlock(i));
- NewPHI->insertBefore(OldPHI);
- ValToPoison[OldPHI] = NewPHI;
- }
-
- for (BasicBlock &BB : F)
- for (Instruction &I : BB) {
- if (isa<PHINode>(I)) continue;
-
- IRBuilder<> B(cast<Instruction>(&I));
-
- // Note: There are many more sources of documented UB, but this pass only
- // attempts to find UB triggered by propagation of poison.
- SmallVector<const Value *, 4> NonPoisonOps;
- SmallPtrSet<const Value *, 4> SeenNonPoisonOps;
- getGuaranteedNonPoisonOps(&I, NonPoisonOps);
- for (const Value *Op : NonPoisonOps)
- if (SeenNonPoisonOps.insert(Op).second)
- CreateAssertNot(B,
- getPoisonFor(ValToPoison, const_cast<Value *>(Op)));
-
- if (LocalCheck)
- if (auto *RI = dyn_cast<ReturnInst>(&I))
- if (RI->getNumOperands() != 0) {
- Value *Op = RI->getOperand(0);
- CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
- }
-
- SmallVector<Value*, 4> Checks;
- for (const Use &U : I.operands()) {
- if (ValToPoison.count(U) && propagatesPoison(U))
- Checks.push_back(getPoisonFor(ValToPoison, U));
- }
-
- if (canCreatePoison(cast<Operator>(&I)))
- generateCreationChecks(I, Checks);
- ValToPoison[&I] = buildOrChain(B, Checks);
- }
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- if (!ValToPoison.count(OldPHI))
- continue; // skip the newly inserted phis
- auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
- auto *OldVal = OldPHI->getIncomingValue(i);
- NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
- }
- }
- return true;
-}
-
-
-PreservedAnalyses PoisonCheckingPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- bool Changed = false;
- for (auto &F : M)
- Changed |= rewrite(F);
-
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-PreservedAnalyses PoisonCheckingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-/* Major TODO Items:
- - Control dependent poison UB
- - Strict mode - (i.e. must analyze every operand)
- - Poison through memory
- - Function ABIs
- - Full coverage of intrinsics, etc.. (ouch)
-
- Instructions w/Unclear Semantics:
- - shufflevector - It would seem reasonable for an out of bounds mask element
- to produce poison, but the LangRef does not state.
- - all binary ops w/vector operands - The likely interpretation would be that
- any element overflowing should produce poison for the entire result, but
- the LangRef does not state.
- - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
- strange that only certian flags should be documented as producing poison.
-
- Cases of clear poison semantics not yet implemented:
- - Exact flags on ashr/lshr produce poison
- - NSW/NUW flags on shl produce poison
- - Inbounds flag on getelementptr produce poison
- - fptosi/fptoui (out of bounds input) produce poison
- - Scalable vector types for insertelement/extractelement
- - Floating point binary ops w/fmf nnan/noinfs flags produce poison
- */
diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
index 88cb04695217..5ef6ffb58a7c 100644
--- a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
@@ -17,12 +17,16 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h"
using namespace llvm;
+const char kRtsanModuleCtorName[] = "rtsan.module_ctor";
+const char kRtsanInitName[] = "__rtsan_ensure_initialized";
+
static SmallVector<Type *> getArgTypes(ArrayRef<Value *> FunctionArgs) {
SmallVector<Type *> Types;
for (Value *Arg : FunctionArgs)
@@ -76,16 +80,22 @@ static PreservedAnalyses runSanitizeRealtimeBlocking(Function &Fn) {
return rtsanPreservedCFGAnalyses();
}
-RealtimeSanitizerPass::RealtimeSanitizerPass(
- const RealtimeSanitizerOptions &Options) {}
+PreservedAnalyses RealtimeSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ getOrCreateSanitizerCtorAndInitFunctions(
+ M, kRtsanModuleCtorName, kRtsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
-PreservedAnalyses RealtimeSanitizerPass::run(Function &Fn,
- AnalysisManager<Function> &AM) {
- if (Fn.hasFnAttribute(Attribute::SanitizeRealtime))
- return runSanitizeRealtime(Fn);
+ for (Function &F : M) {
+ if (F.hasFnAttribute(Attribute::SanitizeRealtime))
+ runSanitizeRealtime(F);
- if (Fn.hasFnAttribute(Attribute::SanitizeRealtimeBlocking))
- return runSanitizeRealtimeBlocking(Fn);
+ if (F.hasFnAttribute(Attribute::SanitizeRealtimeBlocking))
+ runSanitizeRealtimeBlocking(F);
+ }
- return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
}
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 22acf59c78a3..ac033d92e30d 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1045,10 +1045,8 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
->setCannotMerge(); // gets the PC using GET_CALLER_PC.
}
if (Options.TracePCGuard) {
- auto GuardPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, Idx * 4)),
- PtrTy);
+ auto GuardPtr = IRB.CreateConstInBoundsGEP2_64(
+ FunctionGuardArray->getValueType(), FunctionGuardArray, 0, Idx);
if (Options.GatedCallbacks) {
Instruction *I = &*IP;
auto GateBranch = CreateGateBranch(F, FunctionGateCmp, I);
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
new file mode 100644
index 000000000000..19610958e47b
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -0,0 +1,898 @@
+//===----- TypeSanitizer.cpp - type-based-aliasing-violation detector -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer, a type-based-aliasing-violation
+// detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tysan"
+
+// Symbol names used by the instrumentation: the module constructor, the init
+// and check entry points, and the prefix that encodeName() gives to every
+// type-descriptor global.
+static const char *const kTysanModuleCtorName = "tysan.module_ctor";
+static const char *const kTysanInitName = "__tysan_init";
+static const char *const kTysanCheckName = "__tysan_check";
+static const char *const kTysanGVNamePrefix = "__tysan_v1_";
+
+// Names of the pointer-sized globals loaded by getShadowBase() and
+// getAppMemMask() respectively.
+static const char *const kTysanShadowMemoryAddress =
+ "__tysan_shadow_memory_address";
+static const char *const kTysanAppMemMask = "__tysan_app_memory_mask";
+
+// Hidden command-line switch; defaults to false.
+static cl::opt<bool>
+ ClWritesAlwaysSetType("tysan-writes-always-set-type",
+ cl::desc("Writes always set the type"), cl::Hidden,
+ cl::init(false));
+
+STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
+
+namespace {
+
+/// TypeSanitizer: instrument the code in module to find type-based aliasing
+/// violations.
+struct TypeSanitizer {
+ /// Caches the target triple and pointer-sized integer type of \p M, looks up
+ /// an existing "__tysan_set_globals_types" function, and declares the
+ /// runtime callbacks.
+ TypeSanitizer(Module &M);
+ /// Instrument \p F. Returns false without touching \p F when it is the
+ /// module constructor or the globals-type-setting helper.
+ bool run(Function &F, const TargetLibraryInfo &TLI);
+ /// Build "__tysan_set_globals_types" from the "llvm.tysan.globals" named
+ /// metadata (if present) and call it from the module constructor.
+ void instrumentGlobals(Module &M);
+
+private:
+ /// Maps a TBAA metadata node to the global variable holding its descriptor.
+ typedef SmallDenseMap<const MDNode *, GlobalVariable *, 8>
+ TypeDescriptorsMapTy;
+ /// Maps a TBAA type node to its (possibly synthesized) type name.
+ typedef SmallDenseMap<const MDNode *, std::string, 8> TypeNameMapTy;
+
+ /// Declare __tysan_check and the module constructor in \p M and set OrdTy.
+ void initializeCallbacks(Module &M);
+
+ /// Emit, before the first instruction of \p F, a load of the
+ /// "__tysan_shadow_memory_address" global.
+ Instruction *getShadowBase(Function &F);
+ /// Emit, before the first instruction of \p F, a load of the
+ /// "__tysan_app_memory_mask" global.
+ Instruction *getAppMemMask(Function &F);
+
+ bool instrumentWithShadowUpdate(IRBuilder<> &IRB, const MDNode *TBAAMD,
+ Value *Ptr, uint64_t AccessSize, bool IsRead,
+ bool IsWrite, Value *ShadowBase,
+ Value *AppMemMask, bool ForceSetType,
+ bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ const DataLayout &DL);
+
+ /// Memory-related intrinsics/instructions reset the type of the destination
+ /// memory (including allocas and byval arguments).
+ bool instrumentMemInst(Value *I, Instruction *ShadowBase,
+ Instruction *AppMemMask, const DataLayout &DL);
+
+ /// Synthesize an "__anonymous_<md5>" name for an unnamed struct type node
+ /// from its members' names and offsets; returns "" on failure.
+ std::string getAnonymousStructIdentifier(const MDNode *MD,
+ TypeNameMapTy &TypeNames);
+ /// Get or create the descriptor global for the access-tag node \p MD.
+ /// Returns false if \p MD does not have the expected operands.
+ bool generateTypeDescriptor(const MDNode *MD,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M);
+ /// Get or create the descriptor global for the type node \p MD (and,
+ /// recursively, for its members). Returns false if \p MD does not have the
+ /// expected operands.
+ bool generateBaseTypeDescriptor(const MDNode *MD,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M);
+
+ /// Target triple of the module; consulted to decide whether descriptors are
+ /// placed in a comdat (ELF only).
+ const Triple TargetTriple;
+ /// Matched against type names to detect types in an anonymous namespace.
+ Regex AnonNameRegex;
+ /// Pointer-sized integer type taken from the module's data layout.
+ Type *IntptrTy;
+ /// Number of trailing zero bits of the pointer size in bytes.
+ uint64_t PtrShift;
+ /// 32-bit integer type used for the size and flags arguments of
+ /// __tysan_check.
+ IntegerType *OrdTy;
+
+ /// Callbacks to run-time library are computed in initializeCallbacks.
+ FunctionCallee TysanCheck;
+ FunctionCallee TysanCtorFunction;
+
+ /// Callback to set types for globals.
+ Function *TysanGlobalsSetTypeFunction;
+};
+} // namespace
+
+/// Set up the per-module state: target triple, the regex used to recognise
+/// anonymous-namespace type names, the pointer-sized integer type, and the
+/// runtime callback declarations.
+TypeSanitizer::TypeSanitizer(Module &M)
+    : TargetTriple(Triple(M.getTargetTriple())),
+      AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N") {
+  // Pointer-sized integer type and the trailing-zero count of its byte size.
+  IntptrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  PtrShift = countr_zero(IntptrTy->getPrimitiveSizeInBits() / 8);
+
+  // Pick up a globals-type-setting helper that already exists in the module
+  // (null if there is none).
+  TysanGlobalsSetTypeFunction = M.getFunction("__tysan_set_globals_types");
+
+  initializeCallbacks(M);
+}
+
+/// Declare (or look up) the runtime check callback and the module constructor
+/// in \p M, both marked nounwind, and record the 32-bit integer type used for
+/// the callback's size and flags arguments.
+void TypeSanitizer::initializeCallbacks(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+  OrdTy = Builder.getInt32Ty();
+
+  const AttributeList NoUnwindAttrs =
+      AttributeList().addFnAttribute(Ctx, Attribute::NoUnwind);
+  Type *VoidTy = Builder.getVoidTy();
+  auto *PtrTy = Builder.getPtrTy();
+
+  // Arguments of the check callback, in order: pointer to the accessed data,
+  // access size in bytes, pointer to the type descriptor, flags.
+  TysanCheck = M.getOrInsertFunction(kTysanCheckName, NoUnwindAttrs, VoidTy,
+                                     PtrTy, OrdTy, PtrTy, OrdTy);
+
+  // The module constructor takes no arguments.
+  TysanCtorFunction =
+      M.getOrInsertFunction(kTysanModuleCtorName, NoUnwindAttrs, VoidTy);
+}
+
+/// Create the internal function "__tysan_set_globals_types", which sets the
+/// shadow type of every global listed in the "llvm.tysan.globals" named
+/// metadata, and call it from the module constructor. Does nothing (and
+/// leaves TysanGlobalsSetTypeFunction null) when the metadata is absent.
+void TypeSanitizer::instrumentGlobals(Module &M) {
+ TysanGlobalsSetTypeFunction = nullptr;
+
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.tysan.globals");
+ if (!Globals)
+ return;
+
+ // Build `void __tysan_set_globals_types()` with a single block that just
+ // returns; the instrumentation below is inserted before that return.
+ TysanGlobalsSetTypeFunction = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, "__tysan_set_globals_types", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "", TysanGlobalsSetTypeFunction);
+ ReturnInst::Create(M.getContext(), BB);
+
+ const DataLayout &DL = M.getDataLayout();
+ Value *ShadowBase = getShadowBase(*TysanGlobalsSetTypeFunction);
+ Value *AppMemMask = getAppMemMask(*TysanGlobalsSetTypeFunction);
+ TypeDescriptorsMapTy TypeDescriptors;
+ TypeNameMapTy TypeNames;
+
+ // Each entry is read as (global variable, TBAA type node).
+ for (const auto &GMD : Globals->operands()) {
+ auto *GV = mdconst::dyn_extract_or_null<GlobalVariable>(GMD->getOperand(0));
+ if (!GV)
+ continue;
+ const MDNode *TBAAMD = cast<MDNode>(GMD->getOperand(1));
+ // Skip globals for which no type descriptor can be generated.
+ if (!generateBaseTypeDescriptor(TBAAMD, TypeDescriptors, TypeNames, M))
+ continue;
+
+ IRBuilder<> IRB(
+ TysanGlobalsSetTypeFunction->getEntryBlock().getTerminator());
+ Type *AccessTy = GV->getValueType();
+ assert(AccessTy->isSized());
+ uint64_t AccessSize = DL.getTypeStoreSize(AccessTy);
+ // Neither a read nor a write: ForceSetType is true, SanitizeFunction false.
+ instrumentWithShadowUpdate(IRB, TBAAMD, GV, AccessSize, false, false,
+ ShadowBase, AppMemMask, true, false,
+ TypeDescriptors, DL);
+ }
+
+ // NOTE(review): this condition is always true here, since the function was
+ // created above. The insertion below needs the module constructor to
+ // already have a body with a terminator -- presumably created by the
+ // caller; verify.
+ if (TysanGlobalsSetTypeFunction) {
+ IRBuilder<> IRB(cast<Function>(TysanCtorFunction.getCallee())
+ ->getEntryBlock()
+ .getTerminator());
+ IRB.CreateCall(TysanGlobalsSetTypeFunction, {});
+ }
+}
+
+/// Lower-case hexadecimal digits, indexed by nibble value.
+static const char LUT[] = "0123456789abcdef";
+
+/// Turn a type name into the name of its descriptor global: the result starts
+/// with kTysanGVNamePrefix, alphanumeric characters are copied verbatim, '_'
+/// is doubled, and every other byte becomes '_' followed by its two-digit
+/// lower-case hex value.
+static std::string encodeName(StringRef Name) {
+  std::string Encoded = kTysanGVNamePrefix;
+  // Worst case is three output characters per input byte.
+  Encoded.reserve(Encoded.size() + 3 * Name.size());
+
+  for (const unsigned char Ch : Name) {
+    if (isalnum(Ch)) {
+      Encoded.push_back(Ch);
+    } else if (Ch == '_') {
+      Encoded.append("__");
+    } else {
+      Encoded.push_back('_');
+      Encoded.push_back(LUT[Ch >> 4]);
+      Encoded.push_back(LUT[Ch & 15]);
+    }
+  }
+
+  return Encoded;
+}
+
+/// Synthesize a name for the unnamed TBAA struct type node \p MD by hashing
+/// the names and offsets of its members.
+///
+/// Operands of \p MD from index 1 on are read as (member type node, offset)
+/// pairs. A member without a name of its own is named recursively. Member
+/// names that had to be computed are cached in \p TypeNames.
+///
+/// \returns "__anonymous_" followed by the MD5 digest, or an empty string if a
+/// member operand is not an MDNode or a member name cannot be determined.
+std::string
+TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD,
+                                            TypeNameMapTy &TypeNames) {
+  MD5 Hash;
+  // Separator hashed after every field. It is built with an explicit length
+  // because a bare "\0" literal converts to an empty StringRef, which would
+  // hash nothing and let adjacent name/offset fields run together.
+  const StringRef Separator("\0", 1);
+
+  for (unsigned i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+    const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+    if (!MemberNode)
+      return "";
+
+    auto TNI = TypeNames.find(MemberNode);
+    std::string MemberName;
+    if (TNI != TypeNames.end()) {
+      MemberName = TNI->second;
+    } else {
+      if (MemberNode->getNumOperands() < 1)
+        return "";
+      MDString *MemberNameNode = dyn_cast<MDString>(MemberNode->getOperand(0));
+      if (!MemberNameNode)
+        return "";
+      MemberName = MemberNameNode->getString().str();
+      if (MemberName.empty())
+        MemberName = getAnonymousStructIdentifier(MemberNode, TypeNames);
+      if (MemberName.empty())
+        return "";
+      TypeNames[MemberNode] = MemberName;
+    }
+
+    Hash.update(MemberName);
+    Hash.update(Separator);
+
+    // A trailing member without an offset operand (the two-operand scalar
+    // node form) is at offset 0; do not read past the operand list.
+    uint64_t Offset = 0;
+    if (i + 1 < e)
+      Offset =
+          mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue();
+    Hash.update(utostr(Offset));
+    Hash.update(Separator);
+  }
+
+  MD5::MD5Result HashResult;
+  Hash.final(HashResult);
+  return "__anonymous_" + std::string(HashResult.digest().str());
+}
+
+/// Get or create the descriptor global for the TBAA type node \p MD and
+/// record it in \p TypeDescriptors.
+///
+/// Operand 0 of \p MD must be the type name; the remaining operands are read
+/// as (member type node, offset) pairs, and descriptors for the members are
+/// generated recursively. An unnamed node gets a synthesized name from
+/// getAnonymousStructIdentifier. If a global with the encoded name already
+/// exists in \p M it is reused.
+///
+/// \returns false if \p MD (or one of its members) does not have the expected
+/// shape or no name can be determined; true otherwise.
+bool TypeSanitizer::generateBaseTypeDescriptor(
+    const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+    TypeNameMapTy &TypeNames, Module &M) {
+  if (MD->getNumOperands() < 1)
+    return false;
+
+  MDString *NameNode = dyn_cast<MDString>(MD->getOperand(0));
+  if (!NameNode)
+    return false;
+
+  std::string Name = NameNode->getString().str();
+  if (Name.empty())
+    Name = getAnonymousStructIdentifier(MD, TypeNames);
+  if (Name.empty())
+    return false;
+  TypeNames[MD] = Name;
+  std::string EncodedName = encodeName(Name);
+
+  // Reuse a descriptor that already exists in the module under this name.
+  GlobalVariable *GV =
+      dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+  if (GV) {
+    TypeDescriptors[MD] = GV;
+    return true;
+  }
+
+  SmallVector<std::pair<Constant *, uint64_t>> Members;
+  for (unsigned i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+    const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+    if (!MemberNode)
+      return false;
+
+    Constant *Member;
+    auto TDI = TypeDescriptors.find(MemberNode);
+    if (TDI != TypeDescriptors.end()) {
+      Member = TDI->second;
+    } else {
+      if (!generateBaseTypeDescriptor(MemberNode, TypeDescriptors, TypeNames,
+                                      M))
+        return false;
+
+      Member = TypeDescriptors[MemberNode];
+    }
+
+    // A trailing member without an offset operand (the two-operand scalar
+    // node form) is at offset 0; do not read past the operand list.
+    uint64_t Offset = 0;
+    if (i + 1 < e)
+      Offset =
+          mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue();
+
+    Members.push_back(std::make_pair(Member, Offset));
+  }
+
+  // The descriptor for a base type node (scalar or struct) is:
+  //   [2, member count, [type pointer, offset]..., name]
+
+  LLVMContext &C = MD->getContext();
+  Constant *NameData = ConstantDataArray::getString(C, NameNode->getString());
+  SmallVector<Type *> TDSubTys;
+  SmallVector<Constant *> TDSubData;
+
+  // Append one field to the descriptor, keeping types and values in step.
+  auto PushTDSub = [&](Constant *C) {
+    TDSubTys.push_back(C->getType());
+    TDSubData.push_back(C);
+  };
+
+  PushTDSub(ConstantInt::get(IntptrTy, 2));
+  PushTDSub(ConstantInt::get(IntptrTy, Members.size()));
+
+  // Types that are in an anonymous namespace are local to this module.
+  // FIXME: This should really be marked by the frontend in the metadata
+  // instead of having us guess this from the mangled name. Moreover, the regex
+  // here can pick up (unlikely) names in the non-reserved namespace (because
+  // it needs to search into the type to pick up cases where the type in the
+  // anonymous namespace is a template parameter, etc.).
+  bool ShouldBeComdat = !AnonNameRegex.match(NameNode->getString());
+  for (auto &Member : Members) {
+    PushTDSub(Member.first);
+    PushTDSub(ConstantInt::get(IntptrTy, Member.second));
+  }
+
+  PushTDSub(NameData);
+
+  StructType *TDTy = StructType::get(C, TDSubTys);
+  Constant *TD = ConstantStruct::get(TDTy, TDSubData);
+
+  // Module-local types get internal linkage; all others are linkonce_odr so
+  // that identical descriptors from different modules are merged.
+  GlobalVariable *TDGV =
+      new GlobalVariable(TDTy, true,
+                         !ShouldBeComdat ? GlobalValue::InternalLinkage
+                                         : GlobalValue::LinkOnceODRLinkage,
+                         TD, EncodedName);
+  M.insertGlobalVariable(TDGV);
+
+  if (ShouldBeComdat) {
+    if (TargetTriple.isOSBinFormatELF()) {
+      Comdat *TDComdat = M.getOrInsertComdat(EncodedName);
+      TDGV->setComdat(TDComdat);
+    }
+    appendToUsed(M, TDGV);
+  }
+
+  TypeDescriptors[MD] = TDGV;
+  return true;
+}
+
+/// Get or create the descriptor global for the TBAA access tag \p MD and
+/// record it in \p TypeDescriptors.
+///
+/// \p MD is read as (base type node, access type node, offset). Descriptors
+/// for the base and access types are generated on demand through
+/// generateBaseTypeDescriptor. Returns false if \p MD has fewer than three
+/// operands, if either of the first two operands is not an MDNode, or if a
+/// base/access descriptor cannot be generated.
+bool TypeSanitizer::generateTypeDescriptor(
+ const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M) {
+ // Here we need to generate a type descriptor corresponding to this TBAA
+ // metadata node. Under the current scheme there are three kinds of TBAA
+ // metadata nodes: scalar nodes, struct nodes, and struct tag nodes.
+
+ if (MD->getNumOperands() < 3)
+ return false;
+
+ const MDNode *BaseNode = dyn_cast<MDNode>(MD->getOperand(0));
+ if (!BaseNode)
+ return false;
+
+ // This is a struct tag (element-access) node.
+
+ const MDNode *AccessNode = dyn_cast<MDNode>(MD->getOperand(1));
+ if (!AccessNode)
+ return false;
+
+ // Look up the base type's descriptor, generating it if not cached yet.
+ Constant *Base;
+ auto TDI = TypeDescriptors.find(BaseNode);
+ if (TDI != TypeDescriptors.end()) {
+ Base = TDI->second;
+ } else {
+ if (!generateBaseTypeDescriptor(BaseNode, TypeDescriptors, TypeNames, M))
+ return false;
+
+ Base = TypeDescriptors[BaseNode];
+ }
+
+ // Same for the access type's descriptor.
+ Constant *Access;
+ TDI = TypeDescriptors.find(AccessNode);
+ if (TDI != TypeDescriptors.end()) {
+ Access = TDI->second;
+ } else {
+ if (!generateBaseTypeDescriptor(AccessNode, TypeDescriptors, TypeNames, M))
+ return false;
+
+ Access = TypeDescriptors[AccessNode];
+ }
+
+ // The tag descriptor is named "<base descriptor name>_o_<offset>".
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+ std::string EncodedName =
+ std::string(Base->getName()) + "_o_" + utostr(Offset);
+
+ // Reuse a descriptor that already exists in the module under this name.
+ GlobalVariable *GV =
+ dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+ if (GV) {
+ TypeDescriptors[MD] = GV;
+ return true;
+ }
+
+ // The descriptor for a struct tag (access) node is:
+ // [1, base-type pointer, access-type pointer, offset]
+
+ StructType *TDTy =
+ StructType::get(IntptrTy, Base->getType(), Access->getType(), IntptrTy);
+ Constant *TD =
+ ConstantStruct::get(TDTy, ConstantInt::get(IntptrTy, 1), Base, Access,
+ ConstantInt::get(IntptrTy, Offset));
+
+ // The tag descriptor follows the linkage of its base type's descriptor.
+ bool ShouldBeComdat = cast<GlobalVariable>(Base)->getLinkage() ==
+ GlobalValue::LinkOnceODRLinkage;
+
+ GlobalVariable *TDGV =
+ new GlobalVariable(TDTy, true,
+ !ShouldBeComdat ? GlobalValue::InternalLinkage
+ : GlobalValue::LinkOnceODRLinkage,
+ TD, EncodedName);
+ M.insertGlobalVariable(TDGV);
+
+ if (ShouldBeComdat) {
+ if (TargetTriple.isOSBinFormatELF()) {
+ Comdat *TDComdat = M.getOrInsertComdat(EncodedName);
+ TDGV->setComdat(TDComdat);
+ }
+ appendToUsed(M, TDGV);
+ }
+
+ TypeDescriptors[MD] = TDGV;
+ return true;
+}
+
+/// Emit a load of the shadow-memory base address at the very start of \p F
+/// (before the first instruction of its first block) and return that load.
+Instruction *TypeSanitizer::getShadowBase(Function &F) {
+  Instruction *FirstInst = &F.front().front();
+  IRBuilder<> Builder(FirstInst);
+
+  // The address lives in a pointer-sized global, declared on first use.
+  Module *Mod = F.getParent();
+  Constant *ShadowAddrGlobal =
+      Mod->getOrInsertGlobal(kTysanShadowMemoryAddress, IntptrTy);
+
+  return Builder.CreateLoad(IntptrTy, ShadowAddrGlobal, "shadow.base");
+}
+
+/// Emit a load of the application-memory mask at the very start of \p F
+/// (before the first instruction of its first block) and return that load.
+Instruction *TypeSanitizer::getAppMemMask(Function &F) {
+  Instruction *FirstInst = &F.front().front();
+  IRBuilder<> Builder(FirstInst);
+
+  // The mask lives in a pointer-sized global, declared on first use.
+  Module *Mod = F.getParent();
+  Value *MaskGlobal = Mod->getOrInsertGlobal(kTysanAppMemMask, IntptrTy);
+
+  return Builder.CreateLoad(IntptrTy, MaskGlobal, "app.mem.mask");
+}
+
+/// Walk every instruction of \p F and sort the interesting ones into the
+/// output containers:
+///  - loads, stores, cmpxchg and atomicrmw go to \p MemoryAccesses together
+///    with their memory location, and their TBAA tag (if any) goes to
+///    \p TBAAMetadata;
+///  - memory intrinsics, lifetime.start/lifetime.end and allocas go to
+///    \p MemTypeResetInsts.
+/// Instructions carrying !nosanitize metadata are ignored.
+void collectMemAccessInfo(
+    Function &F, const TargetLibraryInfo &TLI,
+    SmallVectorImpl<std::pair<Instruction *, MemoryLocation>> &MemoryAccesses,
+    SmallSetVector<const MDNode *, 8> &TBAAMetadata,
+    SmallVectorImpl<Value *> &MemTypeResetInsts) {
+  for (Instruction &I : instructions(F)) {
+    // Memory accesses inserted by another instrumentation are not ours.
+    if (I.getMetadata(LLVMContext::MD_nosanitize))
+      continue;
+
+    const bool IsMemAccess = isa<LoadInst>(I) || isa<StoreInst>(I) ||
+                             isa<AtomicCmpXchgInst>(I) ||
+                             isa<AtomicRMWInst>(I);
+    if (IsMemAccess) {
+      MemoryLocation Loc = MemoryLocation::get(&I);
+
+      // No extra uses may be introduced on swifterror values.
+      if (Loc.Ptr->isSwiftError())
+        continue;
+
+      // Only address space 0 is handled.
+      if (cast<PointerType>(Loc.Ptr->getType())->getPointerAddressSpace() != 0)
+        continue;
+
+      if (Loc.AATags.TBAA)
+        TBAAMetadata.insert(Loc.AATags.TBAA);
+      MemoryAccesses.push_back(std::make_pair(&I, Loc));
+      continue;
+    }
+
+    if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+      if (CallInst *Call = dyn_cast<CallInst>(&I))
+        maybeMarkSanitizerLibraryCallNoBuiltin(Call, &TLI);
+
+      // Memory intrinsics and lifetime markers reset the type of the memory
+      // they touch.
+      bool ResetsType = isa<MemIntrinsic>(I);
+      if (!ResetsType) {
+        if (auto *Intr = dyn_cast<IntrinsicInst>(&I)) {
+          const Intrinsic::ID IID = Intr->getIntrinsicID();
+          ResetsType = IID == Intrinsic::lifetime_start ||
+                       IID == Intrinsic::lifetime_end;
+        }
+      }
+      if (ResetsType)
+        MemTypeResetInsts.push_back(&I);
+      continue;
+    }
+
+    if (isa<AllocaInst>(I))
+      MemTypeResetInsts.push_back(&I);
+  }
+}
+
+/// Instrument \p F for type sanitization.
+///
+/// Collects the memory accesses and type-resetting values of \p F (allocas,
+/// byval arguments, memory intrinsics, lifetime markers), generates a type
+/// descriptor for every TBAA tag in use, and then instruments each access and
+/// each type-reset point.
+///
+/// \returns true if a type descriptor was generated or any instrumentation
+/// was emitted; false for the sanitizer's own constructor / globals-set-type
+/// functions, which are never instrumented.
+bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) {
+  // This is required to prevent instrumenting call to __tysan_init from within
+  // the module constructor.
+  if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction)
+    return false;
+
+  Module &M = *F.getParent();
+  initializeCallbacks(M);
+
+  // We need to collect all loads and stores, and know for what TBAA nodes we
+  // need to generate type descriptors.
+  SmallVector<std::pair<Instruction *, MemoryLocation>> MemoryAccesses;
+  SmallSetVector<const MDNode *, 8> TBAAMetadata;
+  SmallVector<Value *> MemTypeResetInsts;
+  collectMemAccessInfo(F, TLI, MemoryAccesses, TBAAMetadata, MemTypeResetInsts);
+
+  // byval arguments also need their types reset (they're new stack memory,
+  // just like allocas).
+  for (Argument &A : F.args())
+    if (A.hasByValAttr())
+      MemTypeResetInsts.push_back(&A);
+
+  TypeDescriptorsMapTy TypeDescriptors;
+  TypeNameMapTy TypeNames;
+  bool Res = false;
+  for (const MDNode *MD : TBAAMetadata) {
+    if (TypeDescriptors.count(MD))
+      continue;
+
+    if (!generateTypeDescriptor(MD, TypeDescriptors, TypeNames, M))
+      return Res; // Giving up.
+
+    Res = true;
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeType);
+
+  // Only materialize the shadow base / app memory mask loads in the entry
+  // block when there is something that will actually use them.
+  const bool NothingToInstrument =
+      MemTypeResetInsts.empty() && MemoryAccesses.empty();
+  Instruction *ShadowBase = NothingToInstrument ? nullptr : getShadowBase(F);
+  Instruction *AppMemMask = NothingToInstrument ? nullptr : getAppMemMask(F);
+
+  for (const auto &[I, MLoc] : MemoryAccesses) {
+    IRBuilder<> IRB(I);
+    assert(MLoc.Size.isPrecise());
+    if (instrumentWithShadowUpdate(
+            IRB, MLoc.AATags.TBAA, const_cast<Value *>(MLoc.Ptr),
+            MLoc.Size.getValue(), I->mayReadFromMemory(), I->mayWriteToMemory(),
+            ShadowBase, AppMemMask, false, SanitizeFunction, TypeDescriptors,
+            DL)) {
+      ++NumInstrumentedAccesses;
+      Res = true;
+    }
+  }
+
+  for (Value *ResetPoint : MemTypeResetInsts)
+    Res |= instrumentMemInst(ResetPoint, ShadowBase, AppMemMask, DL);
+
+  return Res;
+}
+
+/// Emit the integer address computation that maps the application pointer
+/// \p Ptr to its shadow location, as a value of type \p IntptrTy:
+///   ((ptrtoint(Ptr) & AppMemMask) << PtrShift) + ShadowBase
+static Value *convertToShadowDataInt(IRBuilder<> &IRB, Value *Ptr,
+                                     Type *IntptrTy, uint64_t PtrShift,
+                                     Value *ShadowBase, Value *AppMemMask) {
+  Value *AppPtrInt = IRB.CreatePtrToInt(Ptr, IntptrTy, "app.ptr.int");
+  Value *MaskedPtr = IRB.CreateAnd(AppPtrInt, AppMemMask, "app.ptr.masked");
+  Value *ShiftedPtr = IRB.CreateShl(MaskedPtr, PtrShift, "app.ptr.shifted");
+  return IRB.CreateAdd(ShiftedPtr, ShadowBase, "shadow.ptr.int");
+}
+
+/// Emit the shadow-memory check and/or update for one memory access.
+///
+/// \param IRB              Builder positioned at the access; its insert point
+///                         is moved as blocks are split.
+/// \param TBAAMD           TBAA tag of the access, or null (a null type
+///                         descriptor is then used).
+/// \param Ptr              Accessed application pointer.
+/// \param AccessSize       Access size in bytes.
+/// \param IsRead,IsWrite   Access kind; encoded as bit 0 / bit 1 of the flags
+///                         passed to the runtime check.
+/// \param ShadowBase,AppMemMask  Values used to compute the shadow address.
+/// \param ForceSetType     Unconditionally store the type, without checking.
+/// \param SanitizeFunction If false, only set the type when the shadow is
+///                         null; never call the runtime check.
+/// \param TypeDescriptors  Map from TBAA node to its descriptor global.
+/// \param DL               Not referenced in this function body.
+/// \returns true on every path.
+bool TypeSanitizer::instrumentWithShadowUpdate(
+ IRBuilder<> &IRB, const MDNode *TBAAMD, Value *Ptr, uint64_t AccessSize,
+ bool IsRead, bool IsWrite, Value *ShadowBase, Value *AppMemMask,
+ bool ForceSetType, bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors, const DataLayout &DL) {
+ Constant *TDGV;
+ if (TBAAMD)
+ TDGV = TypeDescriptors[TBAAMD];
+ else
+ TDGV = Constant::getNullValue(IRB.getPtrTy());
+
+ Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy());
+
+ Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
+ ShadowBase, AppMemMask);
+ Type *Int8PtrPtrTy = PointerType::get(IRB.getPtrTy(), 0);
+ Value *ShadowData =
+ IRB.CreateIntToPtr(ShadowDataInt, Int8PtrPtrTy, "shadow.ptr");
+
+ // Emits, at IRB's current insert point, the stores that record the type:
+ // the descriptor for the first byte and -i for each following byte i.
+ auto SetType = [&]() {
+ IRB.CreateStore(TD, ShadowData);
+
+ // Now fill the remainder of the shadow memory corresponding to the
+ // remainder of the bytes of the type with a bad type descriptor.
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *BadShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt,
+ ConstantInt::get(IntptrTy, i << PtrShift),
+ "shadow.byte." + Twine(i) + ".offset"),
+ Int8PtrPtrTy, "shadow.byte." + Twine(i) + ".ptr");
+
+ // This is the TD value, -i, which is used to indicate that the byte is
+ // i bytes after the first byte of the type.
+ Value *BadTD =
+ IRB.CreateIntToPtr(ConstantInt::getSigned(IntptrTy, -i),
+ IRB.getPtrTy(), "bad.descriptor" + Twine(i));
+ IRB.CreateStore(BadTD, BadShadowData);
+ }
+ };
+
+ if (ForceSetType || (ClWritesAlwaysSetType && IsWrite)) {
+ // In the mode where writes always set the type, for a write (which does
+ // not also read), we just set the type.
+ // NOTE(review): the condition above does not test IsRead, so an access
+ // that both reads and writes also takes this path -- confirm intended.
+ SetType();
+ return true;
+ }
+
+ assert((!ClWritesAlwaysSetType || IsRead) &&
+ "should have handled case above");
+ LLVMContext &C = IRB.getContext();
+ MDNode *UnlikelyBW = MDBuilder(C).createBranchWeights(1, 100000);
+
+ if (!SanitizeFunction) {
+ // If we're not sanitizing this function, then we only care whether we
+ // need to *set* the type.
+ Value *LoadedTD = IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc");
+ Value *NullTDCmp = IRB.CreateIsNull(LoadedTD, "desc.set");
+ Instruction *NullTDTerm = SplitBlockAndInsertIfThen(
+ NullTDCmp, &*IRB.GetInsertPoint(), false, UnlikelyBW);
+ IRB.SetInsertPoint(NullTDTerm);
+ NullTDTerm->getParent()->setName("set.type");
+ SetType();
+ return true;
+ }
+ // We need to check the type here. If the type is unknown, then the read
+ // sets the type. If the type is known, then it is checked. If the type
+ // doesn't match, then we call the runtime (which may yet determine that
+ // the mismatch is okay).
+ //
+ // The checks generated below have the following structure.
+ //
+ // ; First we load the descriptor for the load from shadow memory and
+ // ; compare it against the type descriptor for the current access type.
+ // %shadow.desc = load ptr %shadow.data
+ // %bad.desc = icmp ne %shadow.desc, %td
+ // br %bad.desc, %bad.bb, %good.bb
+ //
+ // bad.bb:
+ // %shadow.desc.null = icmp eq %shadow.desc, null
+ // br %shadow.desc.null, %null.td.bb, %good.td.bb
+ //
+ // null.td.bb:
+ // ; The type is unknown, set it if all bytes in the value are also unknown.
+ // ; To check, we load the shadow data for all bytes of the access. For the
+ // ; pseudo code below, assume an access of size 1.
+ // %shadow.data.int = add %shadow.data.int, 0
+ // %l = load (inttoptr %shadow.data.int)
+ // %is.not.null = icmp ne %l, null
+ // %not.all.unknown = %is.not.null
+ // br %not.all.unknown, before.set.type.bb
+ //
+ // before.set.type.bb:
+ // ; Call runtime to check mismatch.
+ // call void @__tysan_check()
+ // br %set.type.bb
+ //
+ // set.type.bb:
+ // ; Now fill the remainder of the shadow memory corresponding to the
+ // ; remainder of the bytes of the type with a bad type descriptor.
+ // store %TD, %shadow.data
+ // br %continue.bb
+ //
+ // good.td.bb:
+ // ; We have a non-trivial mismatch. Call the runtime.
+ // call void @__tysan_check()
+ // br %continue.bb
+ //
+ // good.bb:
+ // ; We appear to have the right type. Make sure that all other bytes in
+ // ; the type are still marked as interior bytes. If not, call the runtime.
+ // %shadow.data.int = add %shadow.data.int, 0
+ // %l = load (inttoptr %shadow.data.int)
+ // %not.all.interior = icmp sge %l, 0
+ // br %not.all.interior, label %check.rt.bb, label %continue.bb
+ //
+ // check.rt.bb:
+ // call void @__tysan_check()
+ // br %continue.bb
+
+ // Runtime flags: bit 0 is set for a read, bit 1 for a write.
+ Constant *Flags = ConstantInt::get(OrdTy, int(IsRead) | (int(IsWrite) << 1));
+
+ Value *LoadedTD = IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc");
+ Value *BadTDCmp = IRB.CreateICmpNE(LoadedTD, TD, "bad.desc");
+ Instruction *BadTDTerm, *GoodTDTerm;
+ SplitBlockAndInsertIfThenElse(BadTDCmp, &*IRB.GetInsertPoint(), &BadTDTerm,
+ &GoodTDTerm, UnlikelyBW);
+ IRB.SetInsertPoint(BadTDTerm);
+
+ // We now know that the types did not match (we're on the slow path). If
+ // the type is unknown, then set it.
+ Value *NullTDCmp = IRB.CreateIsNull(LoadedTD);
+ Instruction *NullTDTerm, *MismatchTerm;
+ SplitBlockAndInsertIfThenElse(NullTDCmp, &*IRB.GetInsertPoint(), &NullTDTerm,
+ &MismatchTerm);
+
+ // If the type is unknown, then set the type.
+ IRB.SetInsertPoint(NullTDTerm);
+
+ // We're about to set the type. Make sure that all bytes in the value are
+ // also of unknown type.
+ Value *Size = ConstantInt::get(OrdTy, AccessSize);
+ Value *NotAllUnkTD = IRB.getFalse();
+ // Byte 0 is already known to be null on this path, so start at byte 1.
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *UnkShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt, ConstantInt::get(IntptrTy, i << PtrShift)),
+ Int8PtrPtrTy);
+ Value *ILdTD = IRB.CreateLoad(IRB.getPtrTy(), UnkShadowData);
+ NotAllUnkTD = IRB.CreateOr(NotAllUnkTD, IRB.CreateIsNotNull(ILdTD));
+ }
+
+ // Remember the point before the split: the runtime call goes into the new
+ // conditional block, and the type is set afterwards at BeforeSetType.
+ Instruction *BeforeSetType = &*IRB.GetInsertPoint();
+ Instruction *BadUTDTerm =
+ SplitBlockAndInsertIfThen(NotAllUnkTD, BeforeSetType, false, UnlikelyBW);
+ IRB.SetInsertPoint(BadUTDTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
+
+ IRB.SetInsertPoint(BeforeSetType);
+ SetType();
+
+ // We have a non-trivial mismatch. Call the runtime.
+ IRB.SetInsertPoint(MismatchTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
+
+ // We appear to have the right type. Make sure that all other bytes in
+ // the type are still marked as interior bytes. If not, call the runtime.
+ IRB.SetInsertPoint(GoodTDTerm);
+ Value *NotAllBadTD = IRB.getFalse();
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *BadShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt, ConstantInt::get(IntptrTy, i << PtrShift)),
+ Int8PtrPtrTy);
+ Value *ILdTD = IRB.CreatePtrToInt(
+ IRB.CreateLoad(IRB.getPtrTy(), BadShadowData), IntptrTy);
+ // Interior bytes hold negative values (-i); anything >= 0 is unexpected.
+ NotAllBadTD = IRB.CreateOr(
+ NotAllBadTD, IRB.CreateICmpSGE(ILdTD, ConstantInt::get(IntptrTy, 0)));
+ }
+
+ Instruction *BadITDTerm = SplitBlockAndInsertIfThen(
+ NotAllBadTD, &*IRB.GetInsertPoint(), false, UnlikelyBW);
+ IRB.SetInsertPoint(BadITDTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
+ return true;
+}
+
+/// Reset or propagate shadow type information for a value that changes what
+/// types are valid for some memory.
+///
+/// \p V is one of:
+///  - a byval Argument: the shadow of the pointee is zeroed at function entry
+///    (after the ShadowBase / AppMemMask loads);
+///  - an AllocaInst: the shadow of the allocation is zeroed right after it;
+///  - a lifetime.start / lifetime.end marker with a known size: the shadow of
+///    the object is zeroed before the marker;
+///  - a MemIntrinsic with an address-space-0 destination: the destination
+///    shadow is copied (or moved, for memmove) from the source shadow when
+///    the source is an address-space-0 transfer source, and zeroed otherwise.
+///
+/// \p ShadowBase and \p AppMemMask may be null, in which case they are
+/// created on demand.
+///
+/// \returns true if instrumentation was emitted, false if \p V was skipped.
+bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
+                                      Instruction *AppMemMask,
+                                      const DataLayout &DL) {
+  BasicBlock::iterator IP;
+  BasicBlock *BB;
+  Function *F;
+
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    IP = BasicBlock::iterator(I);
+    BB = I->getParent();
+    F = BB->getParent();
+  } else {
+    auto *A = cast<Argument>(V);
+    F = A->getParent();
+    BB = &F->getEntryBlock();
+    IP = BB->getFirstInsertionPt();
+
+    // Find the next insert point after both ShadowBase and AppMemMask. Either
+    // may still be null here; they are then created below, at the start of
+    // the function, which is before IP.
+    if (ShadowBase && IP->comesBefore(ShadowBase))
+      IP = ShadowBase->getNextNode()->getIterator();
+    if (AppMemMask && IP->comesBefore(AppMemMask))
+      IP = AppMemMask->getNextNode()->getIterator();
+  }
+
+  Value *Dest, *Size, *Src = nullptr;
+  bool NeedsMemMove = false;
+  IRBuilder<> IRB(BB, IP);
+
+  if (auto *A = dyn_cast<Argument>(V)) {
+    assert(A->hasByValAttr() && "Type reset for non-byval argument?");
+
+    Dest = A;
+    Size =
+        ConstantInt::get(IntptrTy, DL.getTypeAllocSize(A->getParamByValType()));
+  } else {
+    auto *I = cast<Instruction>(V);
+    if (auto *MI = dyn_cast<MemIntrinsic>(I)) {
+      if (MI->getDestAddressSpace() != 0)
+        return false;
+
+      Dest = MI->getDest();
+      Size = MI->getLength();
+
+      if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+        if (MTI->getSourceAddressSpace() == 0) {
+          Src = MTI->getSource();
+          NeedsMemMove = isa<MemMoveInst>(MTI);
+        }
+      }
+    } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return false;
+
+      Size = II->getArgOperand(0);
+      Dest = II->getArgOperand(1);
+
+      // A size of -1 means the object is variable sized. Shifting it and
+      // using it as a memset length would clobber an arbitrary amount of
+      // shadow memory, so leave such markers uninstrumented.
+      auto *SizeC = dyn_cast<ConstantInt>(Size);
+      if (SizeC && SizeC->isMinusOne())
+        return false;
+    } else if (auto *AI = dyn_cast<AllocaInst>(I)) {
+      // We need to clear the types for new stack allocations (or else we might
+      // read stale type information from a previous function execution).
+
+      IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I)));
+      IRB.SetInstDebugLocation(I);
+
+      Size = IRB.CreateMul(
+          IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy),
+          ConstantInt::get(IntptrTy,
+                           DL.getTypeAllocSize(AI->getAllocatedType())));
+      Dest = I;
+    } else {
+      return false;
+    }
+  }
+
+  // Only reached when something will be emitted, so creating the loads here
+  // never leaves them unused.
+  if (!ShadowBase)
+    ShadowBase = getShadowBase(*F);
+  if (!AppMemMask)
+    AppMemMask = getAppMemMask(*F);
+
+  // shadow(Dest) = ((Dest & AppMemMask) << PtrShift) + ShadowBase
+  Value *ShadowDataInt = IRB.CreateAdd(
+      IRB.CreateShl(
+          IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
+          PtrShift),
+      ShadowBase);
+  Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
+
+  if (!Src) {
+    // No usable source: mark the destination bytes as having unknown type.
+    IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift),
+                     Align(1ull << PtrShift));
+    return true;
+  }
+
+  Value *SrcShadowDataInt = IRB.CreateAdd(
+      IRB.CreateShl(
+          IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
+          PtrShift),
+      ShadowBase);
+  Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+
+  // Mirror the application-level transfer on the shadow so the destination
+  // inherits the source's type information.
+  if (NeedsMemMove) {
+    IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+                      Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+  } else {
+    IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+                     Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+  }
+
+  return true;
+}
+
+/// Function-pass entry point: builds a TypeSanitizer for the module that
+/// contains \p F and runs it on \p F. Always reports that no analyses are
+/// preserved, regardless of what TypeSanitizer::run returns.
+PreservedAnalyses TypeSanitizerPass::run(Function &F,
+                                         FunctionAnalysisManager &FAM) {
+  Module &ParentModule = *F.getParent();
+  TypeSanitizer Sanitizer(ParentModule);
+  const TargetLibraryInfo &LibInfo = FAM.getResult<TargetLibraryAnalysis>(F);
+  Sanitizer.run(F, LibInfo);
+  return PreservedAnalyses::none();
+}
+
+/// Module-pass entry point: creates the sanitizer constructor function
+/// (which calls the init function named by kTysanInitName), instruments the
+/// module's globals, and registers the constructor in the global ctor list
+/// with priority 0. Always reports that no analyses are preserved.
+PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
+                                               ModuleAnalysisManager &AM) {
+  // Only the constructor is needed here; the init callee is discarded.
+  Function *CtorFn =
+      createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
+                                          kTysanInitName, /*InitArgTypes=*/{},
+                                          /*InitArgs=*/{})
+          .first;
+
+  TypeSanitizer Sanitizer(M);
+  Sanitizer.instrumentGlobals(M);
+  appendToGlobalCtors(M, CtorFn, 0);
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index b8571ba07489..bbc7a005b9ff 100644
--- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -132,7 +132,7 @@ static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
if (!BI || !BI->isConditional())
return;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *Cond = BI->getCondition();
if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
return;
@@ -142,7 +142,7 @@ static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
if (isCondRelevantToAnyCallArgument(Cmp, CB))
Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
? Pred
- : Cmp->getInversePredicate()});
+ : Cmp->getInverseCmpPredicate()});
}
/// Record ICmp conditions relevant to any argument in CB following Pred's
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 8d1e793836c7..91a3c3f0d392 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -88,13 +88,12 @@ static Instruction *getContextInstForUse(Use &U) {
namespace {
/// Struct to express a condition of the form %Op0 Pred %Op1.
struct ConditionTy {
- CmpInst::Predicate Pred;
- Value *Op0;
- Value *Op1;
+ CmpPredicate Pred;
+ Value *Op0 = nullptr;
+ Value *Op1 = nullptr;
- ConditionTy()
- : Pred(CmpInst::BAD_ICMP_PREDICATE), Op0(nullptr), Op1(nullptr) {}
- ConditionTy(CmpInst::Predicate Pred, Value *Op0, Value *Op1)
+ ConditionTy() = default;
+ ConditionTy(CmpPredicate Pred, Value *Op0, Value *Op1)
: Pred(Pred), Op0(Op0), Op1(Op1) {}
};
@@ -132,18 +131,17 @@ struct FactOrCheck {
Ty(Ty) {}
FactOrCheck(DomTreeNode *DTN, Use *U)
- : U(U), DoesHold(CmpInst::BAD_ICMP_PREDICATE, nullptr, nullptr),
- NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()),
+ : U(U), NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()),
Ty(EntryTy::UseCheck) {}
- FactOrCheck(DomTreeNode *DTN, CmpInst::Predicate Pred, Value *Op0, Value *Op1,
- ConditionTy Precond = ConditionTy())
+ FactOrCheck(DomTreeNode *DTN, CmpPredicate Pred, Value *Op0, Value *Op1,
+ ConditionTy Precond = {})
: Cond(Pred, Op0, Op1), DoesHold(Precond), NumIn(DTN->getDFSNumIn()),
NumOut(DTN->getDFSNumOut()), Ty(EntryTy::ConditionFact) {}
- static FactOrCheck getConditionFact(DomTreeNode *DTN, CmpInst::Predicate Pred,
+ static FactOrCheck getConditionFact(DomTreeNode *DTN, CmpPredicate Pred,
Value *Op0, Value *Op1,
- ConditionTy Precond = ConditionTy()) {
+ ConditionTy Precond = {}) {
return FactOrCheck(DTN, Pred, Op0, Op1, Precond);
}
@@ -218,7 +216,7 @@ struct StackEntry {
StackEntry(unsigned NumIn, unsigned NumOut, bool IsSigned,
SmallVector<Value *, 2> ValuesToRelease)
: NumIn(NumIn), NumOut(NumOut), IsSigned(IsSigned),
- ValuesToRelease(ValuesToRelease) {}
+ ValuesToRelease(std::move(ValuesToRelease)) {}
};
struct ConstraintTy {
@@ -521,11 +519,21 @@ static Decomposition decompose(Value *V,
else if (match(V, m_NNegZExt(m_Value(Op0)))) {
V = Op0;
IsKnownNonNegative = true;
+ } else if (match(V, m_NSWTrunc(m_Value(Op0)))) {
+ if (Op0->getType()->getScalarSizeInBits() <= 64)
+ V = Op0;
}
if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1))))
return MergeResults(Op0, Op1, IsSigned);
+ if (match(V, m_NSWSub(m_Value(Op0), m_Value(Op1)))) {
+ auto ResA = decompose(Op0, Preconditions, IsSigned, DL);
+ auto ResB = decompose(Op1, Preconditions, IsSigned, DL);
+ ResA.sub(ResB);
+ return ResA;
+ }
+
ConstantInt *CI;
if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) {
auto Result = decompose(Op0, Preconditions, IsSigned, DL);
@@ -558,12 +566,19 @@ static Decomposition decompose(Value *V,
if (match(V, m_ZExt(m_Value(Op0)))) {
IsKnownNonNegative = true;
V = Op0;
- }
-
- if (match(V, m_SExt(m_Value(Op0)))) {
+ } else if (match(V, m_SExt(m_Value(Op0)))) {
V = Op0;
Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
ConstantInt::get(Op0->getType(), 0));
+ } else if (auto *Trunc = dyn_cast<TruncInst>(V)) {
+ if (Trunc->getSrcTy()->getScalarSizeInBits() <= 64) {
+ if (Trunc->hasNoUnsignedWrap() || Trunc->hasNoSignedWrap()) {
+ V = Trunc->getOperand(0);
+ if (!Trunc->hasNoUnsignedWrap())
+ Preconditions.emplace_back(CmpInst::ICMP_SGE, V,
+ ConstantInt::get(V->getType(), 0));
+ }
+ }
}
Value *Op1;
@@ -711,8 +726,8 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
}
for (const auto &KV : VariablesB) {
- if (SubOverflow(R[GetOrAddIndex(KV.Variable)], KV.Coefficient,
- R[GetOrAddIndex(KV.Variable)]))
+ auto &Coeff = R[GetOrAddIndex(KV.Variable)];
+ if (SubOverflow(Coeff, KV.Coefficient, Coeff))
return {};
auto I =
KnownNonNegativeVariables.insert({KV.Variable, KV.IsKnownNonNegative});
@@ -744,9 +759,9 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
if (!KV.second ||
(!Value2Index.contains(KV.first) && !NewIndexMap.contains(KV.first)))
continue;
- SmallVector<int64_t, 8> C(Value2Index.size() + NewVariables.size() + 1, 0);
+ auto &C = Res.ExtraInfo.emplace_back(
+ Value2Index.size() + NewVariables.size() + 1, 0);
C[GetOrAddIndex(KV.first)] = -1;
- Res.ExtraInfo.push_back(C);
}
return Res;
}
@@ -912,7 +927,7 @@ void State::addInfoForInductions(BasicBlock &BB) {
Value *A;
Value *B;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(BB.getTerminator(),
m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value())))
@@ -1079,7 +1094,7 @@ void State::addInfoFor(BasicBlock &BB) {
switch (ID) {
case Intrinsic::assume: {
Value *A, *B;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(I.getOperand(0), m_ICmp(Pred, m_Value(A), m_Value(B))))
break;
if (GuaranteedToExecute) {
@@ -1166,8 +1181,7 @@ void State::addInfoFor(BasicBlock &BB) {
if (auto *Cmp = dyn_cast<ICmpInst>(Cur)) {
WorkList.emplace_back(FactOrCheck::getConditionFact(
DT.getNode(Successor),
- IsOr ? CmpInst::getInversePredicate(Cmp->getPredicate())
- : Cmp->getPredicate(),
+ IsOr ? Cmp->getInverseCmpPredicate() : Cmp->getCmpPredicate(),
Cmp->getOperand(0), Cmp->getOperand(1)));
continue;
}
@@ -1191,13 +1205,12 @@ void State::addInfoFor(BasicBlock &BB) {
return;
if (canAddSuccessor(BB, Br->getSuccessor(0)))
WorkList.emplace_back(FactOrCheck::getConditionFact(
- DT.getNode(Br->getSuccessor(0)), CmpI->getPredicate(),
+ DT.getNode(Br->getSuccessor(0)), CmpI->getCmpPredicate(),
CmpI->getOperand(0), CmpI->getOperand(1)));
if (canAddSuccessor(BB, Br->getSuccessor(1)))
WorkList.emplace_back(FactOrCheck::getConditionFact(
- DT.getNode(Br->getSuccessor(1)),
- CmpInst::getInversePredicate(CmpI->getPredicate()), CmpI->getOperand(0),
- CmpI->getOperand(1)));
+ DT.getNode(Br->getSuccessor(1)), CmpI->getInverseCmpPredicate(),
+ CmpI->getOperand(0), CmpI->getOperand(1)));
}
#ifndef NDEBUG
@@ -1527,7 +1540,7 @@ static bool checkOrAndOpImpliedByOther(
while (!Worklist.empty()) {
Value *Val = Worklist.pop_back_val();
Value *LHS, *RHS;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(Val, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) {
// For OR, check if the negated condition implies CmpToCheck.
if (IsOr)
@@ -1578,53 +1591,52 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B,
LLVM_DEBUG(dbgs() << "Adding '"; dumpUnpackedICmp(dbgs(), Pred, A, B);
dbgs() << "'\n");
- bool Added = false;
auto &CSToUse = getCS(R.IsSigned);
if (R.Coefficients.empty())
return;
- Added |= CSToUse.addVariableRowFill(R.Coefficients);
+ bool Added = CSToUse.addVariableRowFill(R.Coefficients);
+ if (!Added)
+ return;
// If R has been added to the system, add the new variables and queue it for
// removal once it goes out-of-scope.
- if (Added) {
- SmallVector<Value *, 2> ValuesToRelease;
- auto &Value2Index = getValue2Index(R.IsSigned);
- for (Value *V : NewVariables) {
- Value2Index.insert({V, Value2Index.size() + 1});
- ValuesToRelease.push_back(V);
- }
-
- LLVM_DEBUG({
- dbgs() << " constraint: ";
- dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
- dbgs() << "\n";
- });
+ SmallVector<Value *, 2> ValuesToRelease;
+ auto &Value2Index = getValue2Index(R.IsSigned);
+ for (Value *V : NewVariables) {
+ Value2Index.insert({V, Value2Index.size() + 1});
+ ValuesToRelease.push_back(V);
+ }
- DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
- std::move(ValuesToRelease));
-
- if (!R.IsSigned) {
- for (Value *V : NewVariables) {
- ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
- false, false, false);
- VarPos.Coefficients[Value2Index[V]] = -1;
- CSToUse.addVariableRow(VarPos.Coefficients);
- DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
- SmallVector<Value *, 2>());
- }
- }
+ LLVM_DEBUG({
+ dbgs() << " constraint: ";
+ dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
+ dbgs() << "\n";
+ });
- if (R.isEq()) {
- // Also add the inverted constraint for equality constraints.
- for (auto &Coeff : R.Coefficients)
- Coeff *= -1;
- CSToUse.addVariableRowFill(R.Coefficients);
+ DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+ std::move(ValuesToRelease));
+ if (!R.IsSigned) {
+ for (Value *V : NewVariables) {
+ ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
+ false, false, false);
+ VarPos.Coefficients[Value2Index[V]] = -1;
+ CSToUse.addVariableRow(VarPos.Coefficients);
DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
SmallVector<Value *, 2>());
}
}
+
+ if (R.isEq()) {
+ // Also add the inverted constraint for equality constraints.
+ for (auto &Coeff : R.Coefficients)
+ Coeff *= -1;
+ CSToUse.addVariableRowFill(R.Coefficients);
+
+ DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+ SmallVector<Value *, 2>());
+ }
}
static bool replaceSubOverflowUses(IntrinsicInst *II, Value *A, Value *B,
@@ -1796,7 +1808,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
continue;
}
- auto AddFact = [&](CmpInst::Predicate Pred, Value *A, Value *B) {
+ auto AddFact = [&](CmpPredicate Pred, Value *A, Value *B) {
LLVM_DEBUG(dbgs() << "Processing fact to add to the system: ";
dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "\n");
if (Info.getCS(CmpInst::isSigned(Pred)).size() > MaxRows) {
@@ -1810,7 +1822,18 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
if (ReproducerModule && DFSInStack.size() > ReproducerCondStack.size())
ReproducerCondStack.emplace_back(Pred, A, B);
- Info.transferToOtherSystem(Pred, A, B, CB.NumIn, CB.NumOut, DFSInStack);
+ if (ICmpInst::isRelational(Pred)) {
+ // If samesign is present on the ICmp, simply flip the sign of the
+ // predicate, transferring the information from the signed system to the
+ // unsigned system, and viceversa.
+ if (Pred.hasSameSign())
+ Info.addFact(ICmpInst::getFlippedSignednessPredicate(Pred), A, B,
+ CB.NumIn, CB.NumOut, DFSInStack);
+ else
+ Info.transferToOtherSystem(Pred, A, B, CB.NumIn, CB.NumOut,
+ DFSInStack);
+ }
+
if (ReproducerModule && DFSInStack.size() > ReproducerCondStack.size()) {
// Add dummy entries to ReproducerCondStack to keep it in sync with
// DFSInStack.
@@ -1823,7 +1846,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
}
};
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!CB.isConditionFact()) {
Value *X;
if (match(CB.Inst, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) {
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 3c4a40fab3e0..8a5c506eed69 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -109,7 +109,7 @@ static cl::opt<unsigned> MaxNumVisitiedPaths(
"dfa-max-num-visited-paths",
cl::desc(
"Max number of blocks visited while enumerating paths around a switch"),
- cl::Hidden, cl::init(2000));
+ cl::Hidden, cl::init(2500));
static cl::opt<unsigned>
MaxNumPaths("dfa-max-num-paths",
@@ -754,17 +754,15 @@ private:
return Res;
}
- /// Walk the use-def chain and collect all the state-defining instructions.
- ///
- /// Return an empty map if unpredictable values encountered inside the basic
- /// blocks of \p LoopPaths.
+ /// Walk the use-def chain and collect all the state-defining blocks and the
+ /// PHI nodes in those blocks that define the state.
StateDefMap getStateDefMap() const {
StateDefMap Res;
- Value *FirstDef = Switch->getOperand(0);
- assert(isa<PHINode>(FirstDef) && "The first definition must be a phi.");
+ PHINode *FirstDef = dyn_cast<PHINode>(Switch->getOperand(0));
+ assert(FirstDef && "The first definition must be a phi.");
SmallVector<PHINode *, 8> Stack;
- Stack.push_back(dyn_cast<PHINode>(FirstDef));
+ Stack.push_back(FirstDef);
SmallSet<Value *, 16> SeenValues;
while (!Stack.empty()) {
@@ -774,18 +772,15 @@ private:
SeenValues.insert(CurPhi);
for (BasicBlock *IncomingBB : CurPhi->blocks()) {
- Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB);
+ PHINode *IncomingPhi =
+ dyn_cast<PHINode>(CurPhi->getIncomingValueForBlock(IncomingBB));
+ if (!IncomingPhi)
+ continue;
bool IsOutsideLoops = !SwitchOuterLoop->contains(IncomingBB);
- if (Incoming == FirstDef || isa<ConstantInt>(Incoming) ||
- SeenValues.contains(Incoming) || IsOutsideLoops) {
+ if (SeenValues.contains(IncomingPhi) || IsOutsideLoops)
continue;
- }
-
- // Any unpredictable value inside the loops means we must bail out.
- if (!isa<PHINode>(Incoming))
- return StateDefMap();
- Stack.push_back(cast<PHINode>(Incoming));
+ Stack.push_back(IncomingPhi);
}
}
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 5555b5e29cc7..cae5b9c41a37 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2054,7 +2054,7 @@ struct DSEState {
return false;
Instruction *ICmpL;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(BI->getCondition(),
m_c_ICmp(Pred,
m_CombineAnd(m_Load(m_Specific(StorePtr)),
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index cd4846e00603..3a0ae6b01a11 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -192,7 +192,7 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
// mechanism that may remove flags to increase the likelihood of CSE.
Flavor = SPF_UNKNOWN;
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
// Check for commuted variants of min/max by swapping predicate.
@@ -279,7 +279,7 @@ static unsigned getHashValueImpl(SimpleValue Val) {
// Hash general selects to allow matching commuted true/false operands.
// If we do not have a compare as the condition, just hash in the condition.
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *X, *Y;
if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y))))
return hash_combine(Inst->getOpcode(), Cond, A, B);
@@ -290,7 +290,8 @@ static unsigned getHashValueImpl(SimpleValue Val) {
Pred = CmpInst::getInversePredicate(Pred);
std::swap(A, B);
}
- return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B);
+ return hash_combine(Inst->getOpcode(),
+ static_cast<CmpInst::Predicate>(Pred), X, Y, A, B);
}
if (CastInst *CI = dyn_cast<CastInst>(Inst))
@@ -451,7 +452,7 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
// this code, as we simplify the double-negation before hashing the second
// select (and so still succeed at CSEing them).
if (LHSA == RHSB && LHSB == RHSA) {
- CmpInst::Predicate PredL, PredR;
+ CmpPredicate PredL, PredR;
Value *X, *Y;
if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) &&
match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) &&
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index a8fda0c6ab9c..2978b7990a6e 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -727,7 +727,7 @@ GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist,
// L >u C0 && L >u C1 -> L >u max(C0, C1)
ConstantInt *RHS0, *RHS1;
Value *LHS;
- ICmpInst::Predicate Pred0, Pred1;
+ CmpPredicate Pred0, Pred1;
// TODO: Support searching for pairs to merge from both whole lists of
// ChecksToHoist and ChecksToWiden.
if (ChecksToWiden.size() == 1 && ChecksToHoist.size() == 1 &&
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 0bc783412595..e706a6f83b1e 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -105,8 +105,8 @@ static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
cl::Hidden, cl::init(false));
-static cl::opt<unsigned> MinRuntimeIterations("irce-min-runtime-iterations",
- cl::Hidden, cl::init(10));
+static cl::opt<unsigned> MinEliminatedChecks("irce-min-eliminated-checks",
+ cl::Hidden, cl::init(10));
static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
cl::Hidden, cl::init(true));
@@ -130,15 +130,9 @@ static cl::opt<bool>
namespace {
-/// An inductive range check is conditional branch in a loop with
-///
-/// 1. a very cold successor (i.e. the branch jumps to that successor very
-/// rarely)
-///
-/// and
-///
-/// 2. a condition that is provably true for some contiguous range of values
-/// taken by the containing loop's induction variable.
+/// An inductive range check is conditional branch in a loop with a condition
+/// that is provably true for some contiguous range of values taken by the
+/// containing loop's induction variable.
///
class InductiveRangeCheck {
@@ -233,6 +227,7 @@ public:
/// checks, and hence don't end up in \p Checks.
static void extractRangeChecksFromBranch(
BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ std::optional<uint64_t> EstimatedTripCount,
SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed);
};
@@ -246,9 +241,10 @@ class InductiveRangeCheckElimination {
std::optional<llvm::function_ref<llvm::BlockFrequencyInfo &()>>;
GetBFIFunc GetBFI;
- // Returns true if it is profitable to do a transform basing on estimation of
- // number of iterations.
- bool isProfitableToTransform(const Loop &L);
+ // Returns the estimated number of iterations based on block frequency info if
+ // available, or on branch probability info. Nullopt is returned if the number
+ // of iterations cannot be estimated.
+ std::optional<uint64_t> estimatedTripCount(const Loop &L);
public:
InductiveRangeCheckElimination(ScalarEvolution &SE,
@@ -522,6 +518,7 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
void InductiveRangeCheck::extractRangeChecksFromBranch(
BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ std::optional<uint64_t> EstimatedTripCount,
SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed) {
if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
return;
@@ -529,11 +526,32 @@ void InductiveRangeCheck::extractRangeChecksFromBranch(
unsigned IndexLoopSucc = L->contains(BI->getSuccessor(0)) ? 0 : 1;
assert(L->contains(BI->getSuccessor(IndexLoopSucc)) &&
"No edges coming to loop?");
- BranchProbability LikelyTaken(15, 16);
- if (!SkipProfitabilityChecks && BPI &&
- BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc) < LikelyTaken)
- return;
+ if (!SkipProfitabilityChecks && BPI) {
+ auto SuccessProbability =
+ BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc);
+ if (EstimatedTripCount) {
+ auto EstimatedEliminatedChecks =
+ SuccessProbability.scale(*EstimatedTripCount);
+ if (EstimatedEliminatedChecks < MinEliminatedChecks) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch "
+ << *BI << ": "
+ << "estimated eliminated checks too low "
+ << EstimatedEliminatedChecks << "\n";);
+ return;
+ }
+ } else {
+ BranchProbability LikelyTaken(15, 16);
+ if (SuccessProbability < LikelyTaken) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch "
+ << *BI << ": "
+ << "could not estimate trip count "
+ << "and branch success probability too low "
+ << SuccessProbability << "\n";);
+ return;
+ }
+ }
+ }
// IRCE expects branch's true edge comes to loop. Invert branch for opposite
// case.
@@ -938,42 +956,34 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
return getLoopPassPreservedAnalyses();
}
-bool InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L) {
- if (SkipProfitabilityChecks)
- return true;
+std::optional<uint64_t>
+InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) {
if (GetBFI) {
BlockFrequencyInfo &BFI = (*GetBFI)();
uint64_t hFreq = BFI.getBlockFreq(L.getHeader()).getFrequency();
uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency();
- if (phFreq != 0 && hFreq != 0 && (hFreq / phFreq < MinRuntimeIterations)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove profitability: "
- << "the estimated number of iterations basing on "
- "frequency info is " << (hFreq / phFreq) << "\n";);
- return false;
- }
- return true;
+ if (phFreq == 0 || hFreq == 0)
+ return std::nullopt;
+ return {hFreq / phFreq};
}
if (!BPI)
- return true;
+ return std::nullopt;
auto *Latch = L.getLoopLatch();
if (!Latch)
- return true;
+ return std::nullopt;
auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
if (!LatchBr)
- return true;
- auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 1 : 0;
+ return std::nullopt;
+ auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 1 : 0;
BranchProbability ExitProbability =
BPI->getEdgeProbability(Latch, LatchBrExitIdx);
- if (ExitProbability > BranchProbability(1, MinRuntimeIterations)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove profitability: "
- << "the exit probability is too big " << ExitProbability
- << "\n";);
- return false;
- }
- return true;
+ if (ExitProbability.isUnknown() || ExitProbability.isZero())
+ return std::nullopt;
+
+ return {ExitProbability.scaleByInverse(1)};
}
bool InductiveRangeCheckElimination::run(
@@ -989,8 +999,14 @@ bool InductiveRangeCheckElimination::run(
return false;
}
- if (!isProfitableToTransform(*L))
+ auto EstimatedTripCount = estimatedTripCount(*L);
+ if (!SkipProfitabilityChecks && EstimatedTripCount &&
+ *EstimatedTripCount < MinEliminatedChecks) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove profitability: "
+ << "the estimated number of iterations is "
+ << *EstimatedTripCount << "\n");
return false;
+ }
LLVMContext &Context = Preheader->getContext();
SmallVector<InductiveRangeCheck, 16> RangeChecks;
@@ -998,8 +1014,8 @@ bool InductiveRangeCheckElimination::run(
for (auto *BBI : L->getBlocks())
if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
- InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
- RangeChecks, Changed);
+ InductiveRangeCheck::extractRangeChecksFromBranch(
+ TBI, L, SE, BPI, EstimatedTripCount, RangeChecks, Changed);
if (RangeChecks.empty())
return Changed;
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 16110cd25bc6..300a564e222e 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -591,7 +591,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// 'getPredicateOnEdge' method. This would be able to handle value
// inequalities better, for example if the compare is "X < 4" and "X < 3"
// is known true but "X < 4" itself is not available.
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *Val;
Constant *Cst;
if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst))))
@@ -2744,7 +2744,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
// Pred is a predecessor of BB with an unconditional branch to BB. SI is
// a Select instruction in Pred. BB has other predecessors and SI is used in
// a PHI node in BB. SI has no other use.
-// A new basic block, NewBB, is created and SI is converted to compare and
+// A new basic block, NewBB, is created and SI is converted to compare and
// conditional branch. SI is erased from parent.
void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
SelectInst *SI, PHINode *SIUse,
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 3ade32027289..a5d5eecb1ebf 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2430,8 +2430,8 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo,
} else
return false;
- auto MatchICmpAgainstInvariant = [&](Value *C, ICmpInst::Predicate &P,
- Value *&LHS, Value *&RHS) {
+ auto MatchICmpAgainstInvariant = [&](Value *C, CmpPredicate &P, Value *&LHS,
+ Value *&RHS) {
if (!match(C, m_OneUse(m_ICmp(P, m_Value(LHS), m_Value(RHS)))))
return false;
if (!LHS->getType()->isIntegerTy())
@@ -2448,12 +2448,13 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo,
P = ICmpInst::getInversePredicate(P);
return true;
};
- ICmpInst::Predicate P1, P2;
+ CmpPredicate P1, P2;
Value *LHS1, *LHS2, *RHS1, *RHS2;
if (!MatchICmpAgainstInvariant(Cond1, P1, LHS1, RHS1) ||
!MatchICmpAgainstInvariant(Cond2, P2, LHS2, RHS2))
return false;
- if (P1 != P2 || LHS1 != LHS2)
+ // FIXME: Use CmpPredicate::getMatching here.
+ if (P1 != static_cast<CmpInst::Predicate>(P2) || LHS1 != LHS2)
return false;
// Everything is fine, we can do the transform.
@@ -2678,7 +2679,7 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
DominatorTree *DT) {
using namespace PatternMatch;
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *LHS, *RHS;
if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
return false;
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index ff077624802b..73f1942849ac 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -32,7 +32,7 @@ struct ConditionInfo {
/// ICmp instruction with this condition
ICmpInst *ICmp = nullptr;
/// Predicate info
- ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
/// AddRec llvm value
Value *AddRecValue = nullptr;
/// Non PHI AddRec llvm value
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 05cf638d3f09..ba1c2241aea9 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -2432,7 +2432,7 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
// Step 1: Check if the loop backedge is in desirable form.
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *CmpLHS, *CmpRHS;
BasicBlock *TrueBB, *FalseBB;
if (!match(LoopHeaderBB->getTerminator(),
@@ -2797,7 +2797,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
// Step 1: Check if the loop backedge, condition is in desirable form.
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
BasicBlock *TrueBB, *FalseBB;
if (!match(LoopHeaderBB->getTerminator(),
m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 29844c463075..796fba67ee25 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -924,8 +924,7 @@ public:
match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
m_Value(BT), m_ConstantInt(), m_ConstantInt()))) {
IRBuilder<> Builder(&I);
- auto *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd"));
- setShapeInfo(Add, {R, C});
+ auto *Add = Builder.CreateFAdd(AT, BT, "mfadd");
MatrixBuilder MBuilder(Builder);
Instruction *NewInst = MBuilder.CreateMatrixTranspose(
Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
@@ -934,9 +933,13 @@ public:
computeShapeInfoForInst(&I, ShapeMap) &&
"Shape of new instruction doesn't match original shape.");
CleanupBinOp(I, A, B);
- assert(computeShapeInfoForInst(Add, ShapeMap).value_or(ShapeMap[Add]) ==
- ShapeMap[Add] &&
- "Shape of updated addition doesn't match cached shape.");
+ if (auto *AddI = dyn_cast<Instruction>(Add)) {
+ setShapeInfo(AddI, {R, C});
+ assert(
+ computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) ==
+ ShapeMap[AddI] &&
+ "Shape of updated addition doesn't match cached shape.");
+ }
}
}
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0cba5d077da6..5f7cb92d239b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA,
static void combineAAMetadata(Instruction *ReplInst, Instruction *I) {
// FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
// handled here, but combineMetadata doesn't support them yet
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_access_group};
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_invariant_group,
+ LLVMContext::MD_access_group, LLVMContext::MD_prof,
+ LLVMContext::MD_memprof, LLVMContext::MD_callsite};
+ // FIXME: https://github.com/llvm/llvm-project/issues/121495
+ // Use custom AA metadata combining handling instead of combineMetadata, which
+ // is meant for CSE and will drop any metadata not in the KnownIDs list.
combineMetadata(ReplInst, I, KnownIDs, true);
}
@@ -787,43 +791,47 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// Ensure that the value being stored is something that can be memset'able a
// byte at a time like "0" or "-1" or any width, as well as things like
// 0xA0A0A0A0 and 0.0.
- auto *V = SI->getOperand(0);
- if (Value *ByteVal = isBytewiseValue(V, DL)) {
- if (Instruction *I =
- tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
+ Value *V = SI->getOperand(0);
+ Value *ByteVal = isBytewiseValue(V, DL);
+ if (!ByteVal)
+ return false;
- // If we have an aggregate, we try to promote it to memset regardless
- // of opportunity for merging as it can expose optimization opportunities
- // in subsequent passes.
- auto *T = V->getType();
- if (T->isAggregateType()) {
- uint64_t Size = DL.getTypeStoreSize(T);
- IRBuilder<> Builder(SI);
- auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
- SI->getAlign());
- M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
+ if (Instruction *I =
+ tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (!T->isAggregateType())
+ return false;
- LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+ TypeSize Size = DL.getTypeStoreSize(T);
+ if (Size.isScalable())
+ return false;
- // The newly inserted memset is immediately overwritten by the original
- // store, so we do not need to rename uses.
- auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
+ SI->getAlign());
+ M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
- eraseInstruction(SI);
- NumMemSetInfer++;
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
+ // The newly inserted memset is immediately overwritten by the original
+ // store, so we do not need to rename uses.
+ auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
+ auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
- return false;
+ eraseInstruction(SI);
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
}
bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d80af26451ac..f6179cadab42 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -43,6 +43,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -83,6 +84,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -246,6 +248,7 @@ private:
bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
+ bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
void clobberUse(Use &U);
bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
@@ -598,6 +601,7 @@ public:
/// If this is true, the slices are never fully built and should be
/// ignored.
bool isEscaped() const { return PointerEscapingInstr; }
+ bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
/// Support for iterating over the slices.
/// @{
@@ -680,6 +684,7 @@ private:
/// store a pointer to that here and abort trying to form slices of the
/// alloca. This will be null if the alloca slices are analyzed successfully.
Instruction *PointerEscapingInstr;
+ Instruction *PointerEscapingInstrReadOnly;
/// The slices of the alloca.
///
@@ -1390,6 +1395,19 @@ private:
/// Disable SROA entirely if there are unhandled users of the alloca.
void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+
+  /// Visit a call that uses the pointer currently being tracked.
+  ///
+  /// When the use is a data operand of the call that is both non-capturing
+  /// and read-only, the call is recorded as a read-only escape. Every other
+  /// call is forwarded to the base visitor.
+  void visitCallBase(CallBase &CB) {
+    // Keep the short-circuit order: the operand number is only queried once
+    // the use is known to be a data operand of this call.
+    const bool IsNoCaptureReadOnlyArg =
+        CB.isDataOperand(U) && CB.doesNotCapture(U->getOperandNo()) &&
+        CB.onlyReadsMemory(U->getOperandNo());
+
+    if (!IsNoCaptureReadOnlyArg) {
+      Base::visitCallBase(CB);
+      return;
+    }
+
+    PI.setEscapedReadOnly(&CB);
+  }
};
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
@@ -1397,7 +1415,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
AI(AI),
#endif
- PointerEscapingInstr(nullptr) {
+ PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
SliceBuilder PB(DL, AI, *this);
SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
if (PtrI.isEscaped() || PtrI.isAborted()) {
@@ -1408,6 +1426,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
assert(PointerEscapingInstr && "Did not track a bad instruction");
return;
}
+ PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
@@ -1445,6 +1464,9 @@ void AllocaSlices::print(raw_ostream &OS) const {
return;
}
+ if (PointerEscapingInstrReadOnly)
+ OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
+
OS << "Slices of alloca: " << AI << "\n";
for (const_iterator I = begin(), E = end(); I != E; ++I)
print(OS, I);
@@ -5454,6 +5476,88 @@ void SROA::clobberUse(Use &U) {
}
}
+namespace {
+
+/// A basic LoadAndStorePromoter that does not remove store nodes.
+///
+/// Loads handed to the promoter are eligible for deletion once rewritten;
+/// stores and the alloca itself are always left in place (see shouldDelete).
+/// The class is local to this translation unit, so it lives in an anonymous
+/// namespace to give it internal linkage.
+class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
+public:
+  /// \param Insts    Instructions that take part in the promotion.
+  /// \param S        SSAUpdater used to build the replacement values.
+  /// \param ZeroType Type of the value returned by getValueToUseForAlloca.
+  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+                            Type *ZeroType)
+      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
+
+  /// Only instructions that are neither stores nor allocas may be deleted.
+  bool shouldDelete(Instruction *I) const override {
+    return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
+  }
+
+  /// Value to use for the alloca instruction itself. Note that despite the
+  /// member's name this is an undef of ZeroType, not a zero constant.
+  Value *getValueToUseForAlloca(Instruction *I) const override {
+    return UndefValue::get(ZeroType);
+  }
+
+private:
+  /// Type of the undef value produced for the alloca.
+  Type *ZeroType;
+};
+
+} // end anonymous namespace
+
+/// Forward stored values to the loads of \p AI, one run of slices at a time.
+///
+/// Slices are grouped into runs of overlapping offsets. A run is promoted
+/// only when every slice in it covers exactly the same [begin, end) range and
+/// every user is a simple load or simple store of one common type;
+/// assume-like intrinsics are tolerated and skipped. Promotion goes through
+/// BasicLoadAndStorePromoter, which leaves stores and the alloca in place.
+///
+/// \returns false when \p AS holds no slices (nothing was inspected or
+/// changed), true otherwise.
+bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
+  // AS may hold no slices at all (e.g. when every use was recorded only as a
+  // read-only escape). Bail out before reading offsets through end().
+  if (AS.begin() == AS.end())
+    return false;
+
+  // Look through each "partition", looking for slices with the same start/end
+  // that do not overlap with any before them. The slices are sorted by
+  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
+  // sophisticated algorithm that takes splittable slices into account.
+  auto PartitionBegin = AS.begin();
+  auto PartitionEnd = PartitionBegin;
+  uint64_t BeginOffset = PartitionBegin->beginOffset();
+  uint64_t EndOffset = PartitionBegin->endOffset();
+  while (PartitionBegin != AS.end()) {
+    bool AllSameAndValid = true;
+    SmallVector<Instruction *> Insts;
+    Type *PartitionType = nullptr;
+    // Extend the run while the next slice overlaps, or ends within, the
+    // range covered so far.
+    while (PartitionEnd != AS.end() &&
+           (PartitionEnd->beginOffset() < EndOffset ||
+            PartitionEnd->endOffset() <= EndOffset)) {
+      if (AllSameAndValid) {
+        AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset &&
+                           PartitionEnd->endOffset() == EndOffset;
+        Instruction *User =
+            cast<Instruction>(PartitionEnd->getUse()->getUser());
+        if (auto *LI = dyn_cast<LoadInst>(User)) {
+          Type *UserTy = LI->getType();
+          // LoadAndStorePromoter requires all the types to be the same.
+          if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
+            AllSameAndValid = false;
+          PartitionType = UserTy;
+          Insts.push_back(User);
+        } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+          Type *UserTy = SI->getValueOperand()->getType();
+          if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
+            AllSameAndValid = false;
+          PartitionType = UserTy;
+          Insts.push_back(User);
+        } else if (!isAssumeLikeIntrinsic(User)) {
+          AllSameAndValid = false;
+        }
+      }
+      EndOffset = std::max(EndOffset, PartitionEnd->endOffset());
+      ++PartitionEnd;
+    }
+
+    // So long as all the slices start and end offsets matched, update loads to
+    // the values stored in the partition.
+    if (AllSameAndValid && !Insts.empty()) {
+      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
+                        << EndOffset << ")\n");
+      SmallVector<PHINode *, 4> NewPHIs;
+      SSAUpdater SSA(&NewPHIs);
+      // The alloca is added so the promoter can supply a value for it via
+      // getValueToUseForAlloca.
+      Insts.push_back(&AI);
+      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
+      Promoter.run(Insts);
+    }
+
+    // Step on to the next partition.
+    PartitionBegin = PartitionEnd;
+    if (PartitionBegin == AS.end())
+      break;
+    BeginOffset = PartitionBegin->beginOffset();
+    EndOffset = PartitionBegin->endOffset();
+  }
+  return true;
+}
+
/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
@@ -5494,6 +5598,11 @@ SROA::runOnAlloca(AllocaInst &AI) {
if (AS.isEscaped())
return {Changed, CFGChanged};
+ if (AS.isEscapedReadOnly()) {
+ Changed |= propagateStoredValuesToLoads(AI, AS);
+ return {Changed, CFGChanged};
+ }
+
// Delete all the dead users of this alloca before splitting and rewriting it.
for (Instruction *DeadUser : AS.getDeadUsers()) {
// Free up everything used by this instruction.
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 3b701e6ca097..2b27150112ad 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -279,8 +279,6 @@ public:
bool visit(Function &F);
- bool isTriviallyScalarizable(Intrinsic::ID ID);
-
// InstVisitor methods. They return true if the instruction was scalarized,
// false if nothing changed.
bool visitInstruction(Instruction &I) { return false; }
@@ -683,19 +681,6 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
return true;
}
-bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) {
- if (isTriviallyVectorizable(ID))
- return true;
- // TODO: Move frexp to isTriviallyVectorizable.
- // https://github.com/llvm/llvm-project/issues/112408
- switch (ID) {
- case Intrinsic::frexp:
- return true;
- }
- return Intrinsic::isTargetIntrinsic(ID) &&
- TTI->isTargetIntrinsicTriviallyScalarizable(ID);
-}
-
/// If a call to a vector typed intrinsic function, split into a scalar call per
/// element if possible for the intrinsic.
bool ScalarizerVisitor::splitCall(CallInst &CI) {
@@ -715,7 +700,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
Intrinsic::ID ID = F->getIntrinsicID();
- if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID))
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID, TTI))
return false;
// unsigned NumElems = VT->getNumElements();
@@ -743,7 +728,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
// will only scalarize when the struct elements have the same bitness.
if (!CurrVS || CurrVS->NumPacked != VS->NumPacked)
return false;
- if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, I))
+ if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, I, TTI))
Tys.push_back(CurrVS->SplitTy);
}
}
@@ -794,8 +779,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
Tys[0] = VS->RemainderTy;
for (unsigned J = 0; J != NumArgs; ++J) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, J) ||
- TTI->isTargetIntrinsicWithScalarOpAtArg(ID, J)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
ScalarCallOps.push_back(ScalarOperands[J]);
} else {
ScalarCallOps.push_back(Scattered[J][I]);
@@ -1089,7 +1073,7 @@ bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) {
if (!F)
return false;
Intrinsic::ID ID = F->getIntrinsicID();
- if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID))
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID, TTI))
return false;
// Note: Fall through means Operand is a `CallInst` and it is defined in
// `isTriviallyScalarizable`.
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index d8ef450eeb9a..0712ff77151e 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2990,9 +2990,11 @@ static bool collectUnswitchCandidates(
/// into its equivalent where `Pred` is something that we support for injected
/// invariants (so far it is limited to ult), LHS in canonicalized form is
/// non-invariant and RHS is an invariant.
-static void canonicalizeForInvariantConditionInjection(
- ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue,
- BasicBlock *&IfFalse, const Loop &L) {
+static void canonicalizeForInvariantConditionInjection(CmpPredicate &Pred,
+ Value *&LHS, Value *&RHS,
+ BasicBlock *&IfTrue,
+ BasicBlock *&IfFalse,
+ const Loop &L) {
if (!L.contains(IfTrue)) {
Pred = ICmpInst::getInversePredicate(Pred);
std::swap(IfTrue, IfFalse);
@@ -3235,7 +3237,7 @@ static bool collectUnswitchCandidatesWithInjections(
// other).
for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock());
DTN = DTN->getIDom()) {
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Value *LHS = nullptr, *RHS = nullptr;
BasicBlock *IfTrue = nullptr, *IfFalse = nullptr;
auto *BB = DTN->getBlock();
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 75585fcc8026..7d017095c88c 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -78,6 +78,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -93,6 +94,9 @@ using namespace PatternMatch;
static const unsigned UnknownAddressSpace =
std::numeric_limits<unsigned>::max();
+DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter",
+ "Controls whether rewriteCandidateWithBasis is executed.");
+
namespace {
class StraightLineStrengthReduceLegacyPass : public FunctionPass {
@@ -268,8 +272,8 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() {
bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
const Candidate &C) {
return (Basis.Ins != C.Ins && // skip the same instruction
- // They must have the same type too. Basis.Base == C.Base doesn't
- // guarantee their types are the same (PR23975).
+ // They must have the same type too. Basis.Base == C.Base
+ // doesn't guarantee their types are the same (PR23975).
Basis.Ins->getType() == C.Ins->getType() &&
// Basis must dominate C in order to rewrite C with respect to Basis.
DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
@@ -610,6 +614,9 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
void StraightLineStrengthReduce::rewriteCandidateWithBasis(
const Candidate &C, const Candidate &Basis) {
+ if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter))
+ return;
+
assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
C.Stride == Basis.Stride);
// We run rewriteCandidateWithBasis on all candidates in a post-order, so the
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 01090b54e5af..b1f742b838f2 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -686,8 +686,8 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
/// Add a dummy PHI value as soon as we know the new predecessor
void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
for (PHINode &Phi : To->phis()) {
- Value *Undef = UndefValue::get(Phi.getType());
- Phi.addIncoming(Undef, From);
+ Value *Poison = PoisonValue::get(Phi.getType());
+ Phi.addIncoming(Poison, From);
}
AddedPhis[To].push_back(From);
}
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index e4f4052e5e48..fe1b91267c90 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1912,8 +1912,8 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
Type *IntTy = getIntTy(B, TLI);
StringRef PutsName = TLI->getName(LibFunc_puts);
- FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, LibFunc_puts, IntTy,
- B.getPtrTy());
+ FunctionCallee PutS =
+ getOrInsertLibFunc(M, *TLI, LibFunc_puts, IntTy, B.getPtrTy());
inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI);
CallInst *CI = B.CreateCall(PutS, Str, PutsName);
if (const Function *F =
@@ -1970,9 +1970,9 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
Type *SizeTTy = getSizeTTy(B, TLI);
StringRef FWriteName = TLI->getName(LibFunc_fwrite);
- FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite,
- SizeTTy, B.getPtrTy(), SizeTTy,
- SizeTTy, File->getType());
+ FunctionCallee F =
+ getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, SizeTTy, B.getPtrTy(),
+ SizeTTy, SizeTTy, File->getType());
if (File->getType()->isPointerTy())
inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI);
@@ -1994,8 +1994,8 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
StringRef MallocName = TLI->getName(LibFunc_malloc);
Type *SizeTTy = getSizeTTy(B, TLI);
- FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc,
- B.getPtrTy(), SizeTTy);
+ FunctionCallee Malloc =
+ getOrInsertLibFunc(M, *TLI, LibFunc_malloc, B.getPtrTy(), SizeTTy);
inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI);
CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
@@ -2084,8 +2084,8 @@ Value *llvm::emitHotColdNew(Value *Num, IRBuilderBase &B,
return nullptr;
StringRef Name = TLI->getName(NewFunc);
- FunctionCallee Func = M->getOrInsertFunction(Name, B.getPtrTy(),
- Num->getType(), B.getInt8Ty());
+ FunctionCallee Func =
+ M->getOrInsertFunction(Name, B.getPtrTy(), Num->getType(), B.getInt8Ty());
inferNonMandatoryLibFuncAttrs(M, Name, *TLI);
CallInst *CI = B.CreateCall(Func, {Num, B.getInt8(HotCold)}, Name);
@@ -2104,9 +2104,8 @@ Value *llvm::emitHotColdNewNoThrow(Value *Num, Value *NoThrow, IRBuilderBase &B,
return nullptr;
StringRef Name = TLI->getName(NewFunc);
- FunctionCallee Func =
- M->getOrInsertFunction(Name, B.getPtrTy(), Num->getType(),
- NoThrow->getType(), B.getInt8Ty());
+ FunctionCallee Func = M->getOrInsertFunction(
+ Name, B.getPtrTy(), Num->getType(), NoThrow->getType(), B.getInt8Ty());
inferNonMandatoryLibFuncAttrs(M, Name, *TLI);
CallInst *CI = B.CreateCall(Func, {Num, NoThrow, B.getInt8(HotCold)}, Name);
@@ -2147,8 +2146,8 @@ Value *llvm::emitHotColdNewAlignedNoThrow(Value *Num, Value *Align,
StringRef Name = TLI->getName(NewFunc);
FunctionCallee Func = M->getOrInsertFunction(
- Name, B.getPtrTy(), Num->getType(), Align->getType(),
- NoThrow->getType(), B.getInt8Ty());
+ Name, B.getPtrTy(), Num->getType(), Align->getType(), NoThrow->getType(),
+ B.getInt8Ty());
inferNonMandatoryLibFuncAttrs(M, Name, *TLI);
CallInst *CI =
B.CreateCall(Func, {Num, Align, NoThrow, B.getInt8(HotCold)}, Name);
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 17cba2e642a1..725a0eb97eae 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -692,14 +692,14 @@ bool llvm::tryPromoteCall(CallBase &CB) {
if (!VTableEntryLoad)
return false; // Not a vtable entry load.
Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
- APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
+ APInt VTableOffset(DL.getIndexTypeSizeInBits(VTableEntryPtr->getType()), 0);
Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
DL, VTableOffset, /* AllowNonInbounds */ true);
LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
if (!VTablePtrLoad)
return false; // Not a vtable load.
Value *Object = VTablePtrLoad->getPointerOperand();
- APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
+ APInt ObjectOffset(DL.getIndexTypeSizeInBits(Object->getType()), 0);
Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
DL, ObjectOffset, /* AllowNonInbounds */ true);
if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
@@ -710,9 +710,9 @@ bool llvm::tryPromoteCall(CallBase &CB) {
BasicBlock::iterator BBI(VTablePtrLoad);
Value *VTablePtr = FindAvailableLoadedValue(
VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
- if (!VTablePtr)
+ if (!VTablePtr || !VTablePtr->getType()->isPointerTy())
return false; // No vtable found.
- APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
+ APInt VTableOffsetGVBase(DL.getIndexTypeSizeInBits(VTablePtr->getType()), 0);
Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index cb6a4e34c226..8863dff4482a 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -43,21 +43,16 @@ using namespace llvm;
/// See comments in Cloning.h.
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
- ClonedCodeInfo *CodeInfo,
- DebugInfoFinder *DIFinder) {
+ ClonedCodeInfo *CodeInfo) {
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat;
if (BB->hasName())
NewBB->setName(BB->getName() + NameSuffix);
bool hasCalls = false, hasDynamicAllocas = false, hasMemProfMetadata = false;
- Module *TheModule = F ? F->getParent() : nullptr;
// Loop over all instructions, and copy them over.
for (const Instruction &I : *BB) {
- if (DIFinder && TheModule)
- DIFinder->processInstruction(*TheModule, I);
-
Instruction *NewInst = I.clone();
if (I.hasName())
NewInst->setName(I.getName() + NameSuffix);
@@ -157,6 +152,118 @@ DISubprogram *llvm::CollectDebugInfoForCloning(const Function &F,
return SPClonedWithinModule;
}
+/// Seed \p MD with identity mappings for the debug-info nodes collected in
+/// \p DIFinder that must not be duplicated, and report whether the subsequent
+/// remapping has to permit module-level changes.
+///
+/// When \p Changes is below CloneFunctionChangeType::DifferentModule and
+/// \p DIFinder holds at least one subprogram, the following nodes are mapped
+/// to themselves (entries already present in \p MD are left alone):
+///  - every subprogram other than \p SPClonedWithinModule,
+///  - every local scope whose subprogram is not \p SPClonedWithinModule,
+///  - every compile unit and every type.
+///
+/// \returns true if \p Changes is greater than LocalChangesOnly, or if the
+/// identity mappings above were installed; false otherwise.
+bool llvm::BuildDebugInfoMDMap(DenseMap<const Metadata *, TrackingMDRef> &MD,
+ CloneFunctionChangeType Changes,
+ DebugInfoFinder &DIFinder,
+ DISubprogram *SPClonedWithinModule) {
+ // Anything beyond purely local changes already implies module-level changes.
+ bool ModuleLevelChanges = Changes > CloneFunctionChangeType::LocalChangesOnly;
+ if (Changes < CloneFunctionChangeType::DifferentModule &&
+ DIFinder.subprogram_count() > 0) {
+ // Turn on module-level changes, since we need to clone (some of) the
+ // debug info metadata.
+ //
+ // FIXME: Metadata effectively owned by a function should be made
+ // local, and only that local metadata should be cloned.
+ ModuleLevelChanges = true;
+
+ auto mapToSelfIfNew = [&MD](MDNode *N) {
+ // Avoid clobbering an existing mapping.
+ (void)MD.try_emplace(N, N);
+ };
+
+ // Avoid cloning types, compile units, and (other) subprograms.
+ for (DISubprogram *ISP : DIFinder.subprograms()) {
+ if (ISP != SPClonedWithinModule)
+ mapToSelfIfNew(ISP);
+ }
+
+ // If a subprogram isn't going to be cloned skip its lexical blocks as well.
+ for (DIScope *S : DIFinder.scopes()) {
+ auto *LScope = dyn_cast<DILocalScope>(S);
+ if (LScope && LScope->getSubprogram() != SPClonedWithinModule)
+ mapToSelfIfNew(S);
+ }
+
+ // Compile units and types found by DIFinder are mapped to themselves too.
+ for (DICompileUnit *CU : DIFinder.compile_units())
+ mapToSelfIfNew(CU);
+
+ for (DIType *Type : DIFinder.types())
+ mapToSelfIfNew(Type);
+ } else {
+ // Nothing is pre-mapped on this path; no subprogram is expected to have
+ // been selected for cloning here.
+ assert(!SPClonedWithinModule &&
+ "Subprogram should be in DIFinder->subprogram_count()...");
+ }
+
+ return ModuleLevelChanges;
+}
+
+/// Attach to \p NewFunc a remapped copy of every metadata attachment found on
+/// \p OldFunc. Each attached node is passed through MapMetadata with \p VMap,
+/// \p RemapFlag, \p TypeMapper and \p Materializer, and the result is added to
+/// \p NewFunc under the same kind that \p OldFunc used.
+void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc,
+ ValueToValueMapTy &VMap,
+ RemapFlags RemapFlag,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer) {
+ // Snapshot all (kind, node) attachments of the source function first.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ OldFunc.getAllMetadata(MDs);
+ for (auto MD : MDs) {
+ // MD.first is the attachment kind, MD.second the node to remap.
+ NewFunc.addMetadata(MD.first, *MapMetadata(MD.second, VMap, RemapFlag,
+ TypeMapper, Materializer));
+ }
+}
+
+/// Clone the basic blocks of \p OldFunc into \p NewFunc and remap the cloned
+/// instructions through \p VMap. Does nothing if \p OldFunc is a declaration.
+///
+/// Each block is cloned with CloneBasicBlock (passing \p NameSuffix and
+/// \p CodeInfo) and recorded in \p VMap. A blockaddress mapping is added for
+/// every block whose address is taken, and every cloned block that ends in a
+/// ReturnInst has that instruction appended to \p Returns. Afterwards all
+/// instructions from the clone of OldFunc's first block up to the end of
+/// \p NewFunc, together with their debug records, are remapped using
+/// \p RemapFlag, \p TypeMapper and \p Materializer.
+void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc,
+ ValueToValueMapTy &VMap, RemapFlags RemapFlag,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer) {
+ // A declaration has no body to clone.
+ if (OldFunc.isDeclaration())
+ return;
+
+ // Loop over all of the basic blocks in the function, cloning them as
+ // appropriate. Note that we save BE this way in order to handle cloning of
+ // recursive functions into themselves.
+ for (const BasicBlock &BB : OldFunc) {
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB =
+ CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo);
+
+ // Add basic block mapping.
+ VMap[&BB] = CBB;
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ if (BB.hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function *>(&OldFunc),
+ const_cast<BasicBlock *>(&BB));
+ VMap[OldBBAddr] = BlockAddress::get(&NewFunc, CBB);
+ }
+
+ // Note return instructions for the caller.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+ Returns.push_back(RI);
+ }
+
+ // Loop over all of the instructions in the new function, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ // Iteration begins at the clone of OldFunc's first block rather than at
+ // NewFunc.begin(); presumably so that blocks already present in NewFunc
+ // before cloning are left untouched (TODO confirm).
+ for (Function::iterator
+ BB = cast<BasicBlock>(VMap[&OldFunc.front()])->getIterator(),
+ BE = NewFunc.end();
+ BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it, and any
+ // attached debug-info records.
+ for (Instruction &II : *BB) {
+ RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer);
+ RemapDbgRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
+ RemapFlag, TypeMapper, Materializer);
+ }
+}
+
// Clone OldFunc into NewFunc, transforming the old arguments into references to
// VMap values.
void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
@@ -215,101 +322,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
DISubprogram *SPClonedWithinModule =
CollectDebugInfoForCloning(*OldFunc, Changes, DIFinder);
- // Loop over all of the basic blocks in the function, cloning them as
- // appropriate. Note that we save BE this way in order to handle cloning of
- // recursive functions into themselves.
- for (const BasicBlock &BB : *OldFunc) {
-
- // Create a new basic block and copy instructions into it!
- // NOTE: don't pass DIFinder because instructions' debug info was processed
- // in ProcessSubprogramAttachment. This will be cleaned up further.
- BasicBlock *CBB =
- CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, nullptr);
-
- // Add basic block mapping.
- VMap[&BB] = CBB;
-
- // It is only legal to clone a function if a block address within that
- // function is never referenced outside of the function. Given that, we
- // want to map block addresses from the old function to block addresses in
- // the clone. (This is different from the generic ValueMapper
- // implementation, which generates an invalid blockaddress when
- // cloning a function.)
- if (BB.hasAddressTaken()) {
- Constant *OldBBAddr = BlockAddress::get(const_cast<Function *>(OldFunc),
- const_cast<BasicBlock *>(&BB));
- VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
- }
-
- // Note return instructions for the caller.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
- Returns.push_back(RI);
- }
-
- if (Changes < CloneFunctionChangeType::DifferentModule &&
- DIFinder.subprogram_count() > 0) {
- // Turn on module-level changes, since we need to clone (some of) the
- // debug info metadata.
- //
- // FIXME: Metadata effectively owned by a function should be made
- // local, and only that local metadata should be cloned.
- ModuleLevelChanges = true;
-
- auto mapToSelfIfNew = [&VMap](MDNode *N) {
- // Avoid clobbering an existing mapping.
- (void)VMap.MD().try_emplace(N, N);
- };
-
- // Avoid cloning types, compile units, and (other) subprograms.
- SmallPtrSet<const DISubprogram *, 16> MappedToSelfSPs;
- for (DISubprogram *ISP : DIFinder.subprograms()) {
- if (ISP != SPClonedWithinModule) {
- mapToSelfIfNew(ISP);
- MappedToSelfSPs.insert(ISP);
- }
- }
-
- // If a subprogram isn't going to be cloned skip its lexical blocks as well.
- for (DIScope *S : DIFinder.scopes()) {
- auto *LScope = dyn_cast<DILocalScope>(S);
- if (LScope && MappedToSelfSPs.count(LScope->getSubprogram()))
- mapToSelfIfNew(S);
- }
-
- for (DICompileUnit *CU : DIFinder.compile_units())
- mapToSelfIfNew(CU);
-
- for (DIType *Type : DIFinder.types())
- mapToSelfIfNew(Type);
- } else {
- assert(!SPClonedWithinModule &&
- "Subprogram should be in DIFinder->subprogram_count()...");
- }
+ ModuleLevelChanges =
+ BuildDebugInfoMDMap(VMap.MD(), Changes, DIFinder, SPClonedWithinModule);
const auto RemapFlag = ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges;
- // Duplicate the metadata that is attached to the cloned function.
- // Subprograms/CUs/types that were already mapped to themselves won't be
- // duplicated.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs) {
- NewFunc->addMetadata(MD.first, *MapMetadata(MD.second, VMap, RemapFlag,
- TypeMapper, Materializer));
- }
- // Loop over all of the instructions in the new function, fixing up operand
- // references as we go. This uses VMap to do all the hard work.
- for (Function::iterator
- BB = cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
- BE = NewFunc->end();
- BB != BE; ++BB)
- // Loop over all instructions, fixing each one as we find it, and any
- // attached debug-info records.
- for (Instruction &II : *BB) {
- RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer);
- RemapDbgRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
- RemapFlag, TypeMapper, Materializer);
- }
+ CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper,
+ Materializer);
+
+ CloneFunctionBodyInto(*NewFunc, *OldFunc, VMap, RemapFlag, Returns,
+ NameSuffix, CodeInfo, TypeMapper, Materializer);
// Only update !llvm.dbg.cu for DifferentModule (not CloneModule). In the
// same module, the compile unit will already be listed (or not). When
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 6539f924c2ed..7ddb9e22c834 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -627,6 +627,24 @@ bool CodeExtractor::isEligible() const {
return false;
}
}
+ // A stacksave used as an input implies a stackrestore in the outlined
+ // function. This can confuse the prolog/epilog insertion phase, so a
+ // stacksave's uses must not cross the outlined function boundary.
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &I : *BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+ bool IsSave = II->getIntrinsicID() == Intrinsic::stacksave;
+ bool IsRestore = II->getIntrinsicID() == Intrinsic::stackrestore;
+ if (IsSave && any_of(II->users(), [&Blks = this->Blocks](User *U) {
+ return !definedInRegion(Blks, U);
+ }))
+ return false;
+ if (IsRestore && !definedInRegion(Blocks, II->getArgOperand(0)))
+ return false;
+ }
+ }
return true;
}
@@ -935,6 +953,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::SanitizeMemory:
case Attribute::SanitizeNumericalStability:
case Attribute::SanitizeThread:
+ case Attribute::SanitizeType:
case Attribute::SanitizeHWAddress:
case Attribute::SanitizeMemTag:
case Attribute::SanitizeRealtime:
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 47bb31905d1a..5b33edd51cff 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -48,6 +48,21 @@ static void insertCall(Function &CurFn, StringRef Func,
/*isVarArg=*/false)),
{GV}, "", InsertionPt);
Call->setDebugLoc(DL);
+ } else if (TargetTriple.isRISCV() || TargetTriple.isAArch64() ||
+ TargetTriple.isLoongArch()) {
+ // On RISC-V, AArch64, and LoongArch, the `_mcount` function takes
+ // `__builtin_return_address(0)` as an argument since
+ // `__builtin_return_address(1)` is not available on these platforms.
+ Instruction *RetAddr = CallInst::Create(
+ Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress),
+ ConstantInt::get(Type::getInt32Ty(C), 0), "", InsertionPt);
+ RetAddr->setDebugLoc(DL);
+
+ FunctionCallee Fn = M.getOrInsertFunction(
+ Func, FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C),
+ false));
+ CallInst *Call = CallInst::Create(Fn, RetAddr, "", InsertionPt);
+ Call->setDebugLoc(DL);
} else {
FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index cf1a8b4af112..2af447aadce2 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -253,40 +253,17 @@ Evaluator::getCalleeWithFormalArgs(CallBase &CB,
bool Evaluator::getFormalParams(CallBase &CB, Function *F,
SmallVectorImpl<Constant *> &Formals) {
- if (!F)
- return false;
-
auto *FTy = F->getFunctionType();
- if (FTy->getNumParams() > CB.arg_size()) {
- LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
+ if (FTy != CB.getFunctionType()) {
+ LLVM_DEBUG(dbgs() << "Signature mismatch.\n");
return false;
}
- auto ArgI = CB.arg_begin();
- for (Type *PTy : FTy->params()) {
- auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), PTy, DL);
- if (!ArgC) {
- LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
- return false;
- }
- Formals.push_back(ArgC);
- ++ArgI;
- }
+ for (Value *Arg : CB.args())
+ Formals.push_back(getVal(Arg));
return true;
}
-/// If call expression contains bitcast then we may need to cast
-/// evaluated return value to a type of the call expression.
-Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) {
- if (!RV || RV->getType() == ReturnType)
- return RV;
-
- RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL);
- if (!RV)
- LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
- return RV;
-}
-
/// Evaluate all instructions in block BB, returning true if successful, false
/// if we can't evaluate it. NewBB returns the next BB that control flows into,
/// or null upon return. StrippedPointerCastsForAliasAnalysis is set to true if
@@ -520,9 +497,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CB.getType(), C);
- if (!InstResult)
- return false;
+ InstResult = C;
LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
<< *InstResult << "\n");
} else {
@@ -544,10 +519,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
return false;
}
ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CB.getType(), RetVal);
- if (RetVal && !InstResult)
- return false;
-
+ InstResult = RetVal;
if (InstResult) {
LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
<< *InstResult << "\n\n");
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 760341a29d8c..6d4026e8209d 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -83,6 +83,13 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
return 0;
}
+/// Three-way compare two constant ranges. Ranges are ordered by their lower
+/// bound first and, when those compare equal, by their upper bound; both
+/// comparisons are delegated to cmpAPInts.
+int FunctionComparator::cmpConstantRanges(const ConstantRange &L,
+ const ConstantRange &R) const {
+ // A non-zero result on the lower bounds already decides the ordering.
+ if (int Res = cmpAPInts(L.getLower(), R.getLower()))
+ return Res;
+ return cmpAPInts(L.getUpper(), R.getUpper());
+}
+
int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
// Floats are ordered first by semantics (i.e. float, double, half, etc.),
// then by value interpreted as a bitstring (aka APInt).
@@ -147,12 +154,22 @@ int FunctionComparator::cmpAttrs(const AttributeList L,
if (LA.getKindAsEnum() != RA.getKindAsEnum())
return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
- const ConstantRange &LCR = LA.getRange();
- const ConstantRange &RCR = RA.getRange();
- if (int Res = cmpAPInts(LCR.getLower(), RCR.getLower()))
+ if (int Res = cmpConstantRanges(LA.getRange(), RA.getRange()))
return Res;
- if (int Res = cmpAPInts(LCR.getUpper(), RCR.getUpper()))
+ continue;
+ } else if (LA.isConstantRangeListAttribute() &&
+ RA.isConstantRangeListAttribute()) {
+ if (LA.getKindAsEnum() != RA.getKindAsEnum())
+ return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
+
+ ArrayRef<ConstantRange> CRL = LA.getValueAsConstantRangeList();
+ ArrayRef<ConstantRange> CRR = RA.getValueAsConstantRangeList();
+ if (int Res = cmpNumbers(CRL.size(), CRR.size()))
return Res;
+
+ for (const auto &[L, R] : zip(CRL, CRR))
+ if (int Res = cmpConstantRanges(L, R))
+ return Res;
continue;
}
if (LA < RA)
@@ -441,9 +458,7 @@ int FunctionComparator::cmpConstants(const Constant *L,
if (InRangeL) {
if (!InRangeR)
return 1;
- if (int Res = cmpAPInts(InRangeL->getLower(), InRangeR->getLower()))
- return Res;
- if (int Res = cmpAPInts(InRangeL->getUpper(), InRangeR->getUpper()))
+ if (int Res = cmpConstantRanges(*InRangeL, *InRangeR))
return Res;
} else if (InRangeR) {
return -1;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index cdc3f0308fe5..1e4061cb0771 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1279,10 +1279,10 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
// | for.body <---- (md2)
// |_______| |______|
if (Instruction *TI = BB->getTerminator())
- if (TI->hasMetadata(LLVMContext::MD_loop))
+ if (TI->hasNonDebugLocLoopMetadata())
for (BasicBlock *Pred : predecessors(BB))
if (Instruction *PredTI = Pred->getTerminator())
- if (PredTI->hasMetadata(LLVMContext::MD_loop))
+ if (PredTI->hasNonDebugLocLoopMetadata())
return false;
if (BBKillable)
@@ -1345,12 +1345,15 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
}
}
- // If the unconditional branch we replaced contains llvm.loop metadata, we
- // add the metadata to the branch instructions in the predecessors.
+ // If the unconditional branch we replaced contains non-debug llvm.loop
+ // metadata, we add the metadata to the branch instructions in the
+ // predecessors.
if (Instruction *TI = BB->getTerminator())
- if (MDNode *LoopMD = TI->getMetadata(LLVMContext::MD_loop))
+ if (TI->hasNonDebugLocLoopMetadata()) {
+ MDNode *LoopMD = TI->getMetadata(LLVMContext::MD_loop);
for (BasicBlock *Pred : predecessors(BB))
Pred->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopMD);
+ }
if (BBKillable) {
// Everything that jumped to BB now goes to Succ.
@@ -3305,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
return Changed;
}
+// FIXME: https://github.com/llvm/llvm-project/issues/121495
+// Once external callers of this function are removed, either inline into
+// combineMetadataForCSE, or internalize and remove KnownIDs parameter.
void llvm::combineMetadata(Instruction *K, const Instruction *J,
ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
@@ -3317,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
switch (Kind) {
default:
+ // FIXME: https://github.com/llvm/llvm-project/issues/121495
+ // Change to removing only explicitly listed other metadata, and assert
+ // on unknown metadata, to avoid inadvertently dropping newly added
+ // metadata types.
K->setMetadata(Kind, nullptr); // Remove unknown metadata
break;
case LLVMContext::MD_dbg:
@@ -3376,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
K->setMetadata(Kind,
MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
break;
+ case LLVMContext::MD_memprof:
+ K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD));
+ break;
+ case LLVMContext::MD_callsite:
+ K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD));
+ break;
case LLVMContext::MD_preserve_access_index:
// Preserve !preserve.access.index in K.
break;
@@ -3439,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
LLVMContext::MD_nontemporal,
LLVMContext::MD_noundef,
LLVMContext::MD_mmra,
- LLVMContext::MD_noalias_addrspace};
+ LLVMContext::MD_noalias_addrspace,
+ LLVMContext::MD_memprof,
+ LLVMContext::MD_callsite};
combineMetadata(K, J, KnownIDs, KDominatesJ);
}
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 3cbde39b30b4..9a24c1b0d03d 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -378,7 +378,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
return;
}
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
return;
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 04042e71a2b8..fffff295ba92 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -171,14 +171,14 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Users in the OrigPreHeader need to use the value to which the
// original definitions are mapped and anything else can be handled by
// the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
+ // available in UserBB, if not substitute poison.
Value *NewVal;
if (UserBB == OrigPreheader)
NewVal = OrigPreHeaderVal;
else if (SSA.HasValueForBlock(UserBB))
NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
+ NewVal = PoisonValue::get(OrigHeaderVal->getType());
DbgValue->replaceVariableLocationOp(OrigHeaderVal, NewVal);
}
@@ -194,14 +194,14 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Users in the OrigPreHeader need to use the value to which the
// original definitions are mapped and anything else can be handled by
// the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
+ // available in UserBB, if not substitute poison.
Value *NewVal;
if (UserBB == OrigPreheader)
NewVal = OrigPreHeaderVal;
else if (SSA.HasValueForBlock(UserBB))
NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
+ NewVal = PoisonValue::get(OrigHeaderVal->getType());
DVR->replaceVariableLocationOp(OrigHeaderVal, NewVal);
}
}
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 44fdfe530178..d8298646e18d 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -777,8 +777,8 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
+ false, true)
// Publicly exposed interface to pass...
char &llvm::LoopSimplifyID = LoopSimplify::ID;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 70047273c3b9..45915c10107b 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1208,6 +1208,23 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src,
return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select");
}
+/// Emit the final value of a FindLastIV reduction described by \p Desc.
+///
+/// A vector-typed \p Src is first collapsed with an integer max reduction; a
+/// scalar \p Src is used as is. The result is then compared against the
+/// descriptor's sentinel value: if it differs, the reduced value is returned,
+/// otherwise the recurrence start value is returned.
+///
+/// Asserts that \p Desc has a FindLastIV recurrence kind.
+Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src,
+ const RecurrenceDescriptor &Desc) {
+ assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ Desc.getRecurrenceKind()) &&
+ "Unexpected reduction kind");
+ Value *StartVal = Desc.getRecurrenceStartValue();
+ Value *Sentinel = Desc.getSentinelValue();
+ Value *MaxRdx = Src->getType()->isVectorTy()
+ ? Builder.CreateIntMaxReduce(Src, /*IsSigned=*/true)
+ : Src;
+ // Correct the final reduction result back to the start value if the maximum
+ // reduction equals the sentinel value.
+ Value *Cmp =
+ Builder.CreateCmp(CmpInst::ICMP_NE, MaxRdx, Sentinel, "rdx.select.cmp");
+ return Builder.CreateSelect(Cmp, MaxRdx, StartVal, "rdx.select");
+}
+
Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty,
FastMathFlags Flags) {
bool Negative = false;
@@ -1315,6 +1332,8 @@ Value *llvm::createReduction(IRBuilderBase &B,
RecurKind RK = Desc.getRecurrenceKind();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
return createAnyOfReduction(B, Src, Desc, OrigPhi);
+ if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
+ return createFindLastIVReduction(B, Src, Desc);
return createSimpleReduction(B, Src, RK);
}
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 8f8c40a4e73b..5ee551e6f0cc 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
@@ -278,6 +279,9 @@ bool runImpl(LoopInfo *LI, LoopAccessInfoManager &LAIs, DominatorTree *DT,
if (!LAI.hasConvergentOp() &&
(LAI.getNumRuntimePointerChecks() ||
!LAI.getPSE().getPredicate().isAlwaysTrue())) {
+ if (!L->isLCSSAForm(*DT))
+ formLCSSARecursively(*L, *DT, LI, SE);
+
LoopVersioning LVer(LAI, LAI.getRuntimePointerChecking()->getChecks(), L,
LI, DT, SE);
LVer.versionLoop();
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 4225e7e80fda..81aa7ce1cfe6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -147,6 +147,16 @@ static bool refineInstruction(SCCPSolver &Solver,
Changed = true;
}
}
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+ if (GEP->hasNoUnsignedWrap() || !GEP->hasNoUnsignedSignedWrap())
+ return false;
+
+ if (all_of(GEP->indices(),
+ [&](Value *V) { return GetRange(V).isAllNonNegative(); })) {
+ GEP->setNoWrapFlags(GEP->getNoWrapFlags() |
+ GEPNoWrapFlags::noUnsignedWrap());
+ Changed = true;
+ }
}
return Changed;
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 597d470f18ff..4bf4acd6330f 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -412,9 +412,13 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
updateDebugInfo(SI);
SSA.AddAvailableValue(BB, SI->getOperand(0));
- } else
+ } else if (auto *AI = dyn_cast<AllocaInst>(User)) {
+ // Treat an AllocaInst as a store of the getValueToUseForAlloca value.
+ SSA.AddAvailableValue(BB, getValueToUseForAlloca(AI));
+ } else {
// Otherwise it is a load, queue it to rewrite as a live-in load.
LiveInLoads.push_back(cast<LoadInst>(User));
+ }
BlockUses.clear();
continue;
}
@@ -422,7 +426,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
// Otherwise, check to see if this block is all loads.
bool HasStore = false;
for (Instruction *I : BlockUses) {
- if (isa<StoreInst>(I)) {
+ if (isa<StoreInst>(I) || isa<AllocaInst>(I)) {
HasStore = true;
break;
}
@@ -468,6 +472,12 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
// Remember that this is the active value in the block.
StoredValue = SI->getOperand(0);
+ } else if (auto *AI = dyn_cast<AllocaInst>(&I)) {
+ // Check if this is an alloca, in which case we treat it as a store of
+ // the getValueToUseForAlloca value.
+ if (!isInstInList(AI, Insts))
+ continue;
+ StoredValue = getValueToUseForAlloca(AI);
}
}
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 791d52882397..0bc752a92340 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1816,7 +1816,7 @@ bool SCEVExpander::hasRelatedExistingExpansion(const SCEV *S,
// Look for suitable value in simple conditions at the loop exits.
for (BasicBlock *BB : ExitingBlocks) {
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
Instruction *LHS, *RHS;
if (!match(BB->getTerminator(),
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c7e814bced57..febc5682c212 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -285,7 +285,7 @@ class SimplifyCFGOpt {
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
- bool hoistCommonCodeFromSuccessors(Instruction *TI, bool EqTermsOnly);
+ bool hoistCommonCodeFromSuccessors(Instruction *TI, bool AllInstsEqOnly);
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
@@ -1772,13 +1772,84 @@ static bool isSafeCheapLoadStore(const Instruction *I,
getLoadStoreAlignment(I) < Value::MaximumAlignment;
}
+namespace {
+
+// LockstepReverseIterator - Iterates through instructions
+// in a set of blocks in reverse order from the first non-terminator.
+// For example (assume all blocks have size n):
+// LockstepReverseIterator I([B1, B2, B3]);
+// *I-- = [B1[n], B2[n], B3[n]];
+// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+// ...
+// The example above is pseudo-code: only the prefix forms of -- and ++ are
+// defined, and both return void.
+//
+// Debug-info intrinsics are skipped when stepping. As soon as any block runs
+// out of instructions the iterator becomes invalid (see isValid()) and stays
+// so until reset() is called; further steps are then no-ops.
+class LockstepReverseIterator {
+ // The blocks being walked. ArrayRef does not own its elements, so the
+ // caller's block list has to outlive this iterator.
+ ArrayRef<BasicBlock *> Blocks;
+ // Current instruction of each block, in the same order as Blocks.
+ SmallVector<Instruction *, 4> Insts;
+ // Set once some block has no further instruction in the stepping direction.
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ // Position every cursor on the last non-debug instruction that precedes its
+ // block's terminator. If some block has no such instruction the iterator is
+ // marked invalid and the remaining blocks are not visited.
+ // Assumes every block has a terminator (getTerminator() is dereferenced
+ // unchecked).
+ void reset() {
+ Fail = false;
+ Insts.clear();
+ for (auto *BB : Blocks) {
+ Instruction *Inst = BB->getTerminator();
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ if (!Inst) {
+ // Block wasn't big enough.
+ Fail = true;
+ return;
+ }
+ Insts.push_back(Inst);
+ }
+ }
+
+ // True while every cursor refers to an instruction.
+ bool isValid() const { return !Fail; }
+
+ // Move every cursor to the previous non-debug instruction of its block.
+ // On reaching the start of any block the iterator becomes invalid and
+ // returns at once, so Insts may be left partially updated (including a null
+ // entry) and must not be used afterwards.
+ void operator--() {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ // Already at beginning of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
+ }
+ }
+
+ // Move every cursor to the next non-debug instruction of its block. The
+ // same failure behaviour as operator-- applies when a block has no next
+ // instruction.
+ void operator++() {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getNextNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getNextNode();
+ // Already at end of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
+ }
+ }
+
+ // View of the current instruction of each block; only meaningful while
+ // isValid() is true.
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+};
+
+} // end anonymous namespace
+
/// Hoist any common code in the successor blocks up into the block. This
-/// function guarantees that BB dominates all successors. If EqTermsOnly is
-/// given, only perform hoisting in case both blocks only contain a terminator.
-/// In that case, only the original BI will be replaced and selects for PHIs are
-/// added.
+/// function guarantees that BB dominates all successors. If AllInstsEqOnly is
+/// given, only perform hoisting in case all successor blocks contain matching
+/// instructions only. In that case, all instructions can be hoisted and the
+/// original branch will be replaced and selects for PHIs are added.
bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
- bool EqTermsOnly) {
+ bool AllInstsEqOnly) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
// O(N1*N2*...) situations here where Ni are the sizes of these successors. As
@@ -1807,17 +1878,35 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
SuccIterPairs.push_back(SuccIterPair(SuccItr, 0));
}
- // Check if only hoisting terminators is allowed. This does not add new
- // instructions to the hoist location.
- if (EqTermsOnly) {
- // Skip any debug intrinsics, as they are free to hoist.
- for (auto &SuccIter : make_first_range(SuccIterPairs)) {
- auto *INonDbg = &*skipDebugIntrinsics(SuccIter);
- if (!INonDbg->isTerminator())
- return false;
+ if (AllInstsEqOnly) {
+ // Check if all instructions in the successor blocks match. This allows
+ // hoisting all instructions and removing the blocks we are hoisting from,
+ // so does not add any new instructions.
+ SmallVector<BasicBlock *> Succs = to_vector(successors(BB));
+ // Check if sizes and terminators of all successors match.
+ bool AllSame = none_of(Succs, [&Succs](BasicBlock *Succ) {
+ Instruction *Term0 = Succs[0]->getTerminator();
+ Instruction *Term = Succ->getTerminator();
+ return !Term->isSameOperationAs(Term0) ||
+ !equal(Term->operands(), Term0->operands()) ||
+ Succs[0]->size() != Succ->size();
+ });
+ if (!AllSame)
+ return false;
+ if (AllSame) {
+ LockstepReverseIterator LRI(Succs);
+ while (LRI.isValid()) {
+ Instruction *I0 = (*LRI)[0];
+ if (any_of(*LRI, [I0](Instruction *I) {
+ return !areIdenticalUpToCommutativity(I0, I);
+ })) {
+ return false;
+ }
+ --LRI;
+ }
}
- // Now we know that we only need to hoist debug intrinsics and the
- // terminator. Let the loop below handle those 2 cases.
+ // Now we know that all instructions in all successors can be hoisted. Let
+ // the loop below handle the hoisting.
}
// Count how many instructions were not hoisted so far. There's a limit on how
@@ -2350,81 +2439,6 @@ static void sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
}
}
-namespace {
-
- // LockstepReverseIterator - Iterates through instructions
- // in a set of blocks in reverse order from the first non-terminator.
- // For example (assume all blocks have size n):
- // LockstepReverseIterator I([B1, B2, B3]);
- // *I-- = [B1[n], B2[n], B3[n]];
- // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
- // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
- // ...
- class LockstepReverseIterator {
- ArrayRef<BasicBlock*> Blocks;
- SmallVector<Instruction*,4> Insts;
- bool Fail;
-
- public:
- LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) {
- reset();
- }
-
- void reset() {
- Fail = false;
- Insts.clear();
- for (auto *BB : Blocks) {
- Instruction *Inst = BB->getTerminator();
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- if (!Inst) {
- // Block wasn't big enough.
- Fail = true;
- return;
- }
- Insts.push_back(Inst);
- }
- }
-
- bool isValid() const {
- return !Fail;
- }
-
- void operator--() {
- if (Fail)
- return;
- for (auto *&Inst : Insts) {
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- // Already at beginning of block.
- if (!Inst) {
- Fail = true;
- return;
- }
- }
- }
-
- void operator++() {
- if (Fail)
- return;
- for (auto *&Inst : Insts) {
- for (Inst = Inst->getNextNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getNextNode();
- // Already at end of block.
- if (!Inst) {
- Fail = true;
- return;
- }
- }
- }
-
- ArrayRef<Instruction*> operator * () const {
- return Insts;
- }
- };
-
-} // end anonymous namespace
-
/// Check whether BB's predecessors end with unconditional branches. If it is
/// true, sink any common code from the predecessors to BB.
static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
@@ -6517,8 +6531,8 @@ SwitchLookupTable::SwitchLookupTable(
uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
TableContents[Idx] = CaseRes;
- if (CaseRes != SingleValue)
- SingleValue = nullptr;
+ if (SingleValue && !isa<PoisonValue>(CaseRes) && CaseRes != SingleValue)
+ SingleValue = isa<PoisonValue>(SingleValue) ? CaseRes : nullptr;
}
// Fill in any holes in the table with the default result.
@@ -6531,7 +6545,10 @@ SwitchLookupTable::SwitchLookupTable(
TableContents[I] = DefaultValue;
}
- if (DefaultValue != SingleValue)
+ // If the default value is poison, all the holes are poison.
+ bool DefaultValueIsPoison = isa<PoisonValue>(DefaultValue);
+
+ if (DefaultValue != SingleValue && !DefaultValueIsPoison)
SingleValue = nullptr;
}
@@ -6555,6 +6572,16 @@ SwitchLookupTable::SwitchLookupTable(
// Check if there is the same distance between two consecutive values.
for (uint64_t I = 0; I < TableSize; ++I) {
ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+
+ if (!ConstVal && isa<PoisonValue>(TableContents[I])) {
+ // This is a poison value, so it's (probably) a lookup table hole.
+ // To prevent any regressions from before we switched to using poison as
+ // the default value, holes will fall back to using the first value.
+ // This can be removed once we add proper handling for poisons in lookup
+ // tables.
+ ConstVal = dyn_cast<ConstantInt>(Values[0].second);
+ }
+
if (!ConstVal) {
// This is an undef. We could deal with it, but undefs in lookup tables
// are very seldom. It's probably not worth the additional complexity.
@@ -6989,8 +7016,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If the table has holes but the default destination doesn't produce any
// constant results, the lookup table entries corresponding to the holes will
- // contain undefined values.
- bool AllHolesAreUndefined = TableHasHoles && !HasDefaultResults;
+ // contain poison.
+ bool AllHolesArePoison = TableHasHoles && !HasDefaultResults;
// If the default destination doesn't produce a constant result but is still
// reachable, and the lookup table has holes, we need to use a mask to
@@ -6998,7 +7025,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// to the default case.
// The mask is unnecessary if the table has holes but the default destination
// is unreachable, as in that case the holes must also be unreachable.
- bool NeedMask = AllHolesAreUndefined && DefaultIsReachable;
+ bool NeedMask = AllHolesArePoison && DefaultIsReachable;
if (NeedMask) {
// As an extra penalty for the validity test we require more cases.
if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
@@ -7143,9 +7170,11 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
for (PHINode *PHI : PHIs) {
const ResultListTy &ResultList = ResultLists[PHI];
+ Type *ResultType = ResultList.begin()->second->getType();
+
// Use any value to fill the lookup table holes.
Constant *DV =
- AllHolesAreUndefined ? ResultLists[PHI][0].second : DefaultResults[PHI];
+ AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
StringRef FuncName = Fn->getName();
SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV,
DL, FuncName);
@@ -7474,9 +7503,6 @@ static bool simplifySwitchOfCmpIntrinsic(SwitchInst *SI, IRBuilderBase &Builder,
/// IncomingValue and add it in the Wrapper so isEqual can do O(1) checking
/// of the incoming values.
struct SwitchSuccWrapper {
- // Keep so we can use SwitchInst::setSuccessor to do the replacement. It won't
- // be important to equality though.
- unsigned SuccNum;
BasicBlock *Dest;
DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> *PhiPredIVs;
};
@@ -7563,6 +7589,7 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
SmallPtrSet<PHINode *, 8> Phis;
SmallPtrSet<BasicBlock *, 8> Seen;
DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> PhiPredIVs;
+ DenseMap<BasicBlock *, SmallVector<unsigned, 4>> BBToSuccessorIndexes;
SmallVector<SwitchSuccWrapper> Cases;
Cases.reserve(SI->getNumSuccessors());
@@ -7575,8 +7602,9 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
continue;
// FIXME: This case needs some extra care because the terminators other than
- // SI need to be updated.
- if (BB->hasNPredecessorsOrMore(2))
+ // SI need to be updated. For now, consider only backedges to the SI.
+ if (BB->hasNPredecessorsOrMore(4) ||
+ BB->getUniquePredecessor() != SI->getParent())
continue;
// FIXME: Relax that the terminator is a BranchInst by checking for equality
@@ -7591,8 +7619,11 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
for (BasicBlock *Succ : BI->successors())
for (PHINode &Phi : Succ->phis())
Phis.insert(&Phi);
+ // Add the successor only if not previously visited.
+ Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs});
}
- Cases.emplace_back(SwitchSuccWrapper{I, BB, &PhiPredIVs});
+
+ BBToSuccessorIndexes[BB].emplace_back(I);
}
// Precompute a data structure to improve performance of isEqual for
@@ -7627,7 +7658,9 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
// We know that SI's parent BB no longer dominates the old case successor
// since we are making it dead.
Updates.push_back({DominatorTree::Delete, SI->getParent(), SSW.Dest});
- SI->setSuccessor(SSW.SuccNum, (*It)->Dest);
+ const auto &Successors = BBToSuccessorIndexes.at(SSW.Dest);
+ for (unsigned Idx : Successors)
+ SI->setSuccessor(Idx, (*It)->Dest);
MadeChange = true;
}
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 7fca1a6aa526..f05d32d980e5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -2164,16 +2164,14 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
!NarrowDefRHS->isNonNegative())
return;
- auto UpdateRangeFromCondition = [&] (Value *Condition,
- bool TrueDest) {
- CmpInst::Predicate Pred;
+ auto UpdateRangeFromCondition = [&](Value *Condition, bool TrueDest) {
+ CmpPredicate Pred;
Value *CmpRHS;
if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS),
m_Value(CmpRHS))))
return;
- CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+ CmpPredicate P = TrueDest ? Pred : ICmpInst::getInverseCmpPredicate(Pred);
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index d85e0d994660..737818b7825c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -397,9 +397,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
// We have enough information to now generate the memcpy call to do the
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(
- CpyDst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
+ B.CreateMemCpy(CpyDst, Align(1), Src, Align(1),
+ TLI->getAsSizeT(Len + 1, *B.GetInsertBlock()->getModule()));
return Dst;
}
@@ -590,26 +589,21 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
if (Len1 && Len2) {
return copyFlags(
*CI, emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- std::min(Len1, Len2)),
+ TLI->getAsSizeT(std::min(Len1, Len2), *CI->getModule()),
B, DL, TLI));
}
// strcmp to memcmp
if (!HasStr1 && HasStr2) {
if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return copyFlags(
- *CI,
- emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2),
- B, DL, TLI));
+ return copyFlags(*CI, emitMemCmp(Str1P, Str2P,
+ TLI->getAsSizeT(Len2, *CI->getModule()),
+ B, DL, TLI));
} else if (HasStr1 && !HasStr2) {
if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return copyFlags(
- *CI,
- emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1),
- B, DL, TLI));
+ return copyFlags(*CI, emitMemCmp(Str1P, Str2P,
+ TLI->getAsSizeT(Len1, *CI->getModule()),
+ B, DL, TLI));
}
annotateNonNullNoUndefBasedOnAccess(CI, {0, 1});
@@ -676,19 +670,15 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
if (!HasStr1 && HasStr2) {
Len2 = std::min(Len2, Length);
if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return copyFlags(
- *CI,
- emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2),
- B, DL, TLI));
+ return copyFlags(*CI, emitMemCmp(Str1P, Str2P,
+ TLI->getAsSizeT(Len2, *CI->getModule()),
+ B, DL, TLI));
} else if (HasStr1 && !HasStr2) {
Len1 = std::min(Len1, Length);
if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return copyFlags(
- *CI,
- emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1),
- B, DL, TLI));
+ return copyFlags(*CI, emitMemCmp(Str1P, Str2P,
+ TLI->getAsSizeT(Len1, *CI->getModule()),
+ B, DL, TLI));
}
return nullptr;
@@ -722,15 +712,13 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ TLI->getAsSizeT(Len, *CI->getModule()));
mergeAttributesAndFlags(NewCI, *CI);
return Dst;
}
Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
// stpcpy(d,s) -> strcpy(d,s) if the result is not used.
@@ -749,10 +737,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
else
return nullptr;
- Type *PT = Callee->getFunctionType()->getParamType(0);
- Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
+ Value *LenV = TLI->getAsSizeT(Len, *CI->getModule());
Value *DstEnd = B.CreateInBoundsGEP(
- B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
+ B.getInt8Ty(), Dst, TLI->getAsSizeT(Len - 1, *CI->getModule()));
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
@@ -819,13 +806,11 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) {
return ConstantInt::get(CI->getType(), 0);
}
- Function *Callee = CI->getCalledFunction();
- Type *PT = Callee->getFunctionType()->getParamType(0);
// Transform strlcpy(D, S, N) to memcpy(D, S, N') where N' is the lower
// bound on strlen(S) + 1 and N, optionally followed by a nul store to
// D[N' - 1] if necessary.
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(PT), NBytes));
+ TLI->getAsSizeT(NBytes, *CI->getModule()));
mergeAttributesAndFlags(NewCI, *CI);
if (!NulTerm) {
@@ -844,7 +829,6 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) {
// otherwise.
Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd,
IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
Value *Size = CI->getArgOperand(2);
@@ -921,11 +905,10 @@ Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd,
/*M=*/nullptr, /*AddNull=*/false);
}
- Type *PT = Callee->getFunctionType()->getParamType(0);
// st{p,r}ncpy(D, S, N) -> memcpy(align 1 D, align 1 S, N) when both
// S and N are constant.
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(PT), N));
+ TLI->getAsSizeT(N, *CI->getModule()));
mergeAttributesAndFlags(NewCI, *CI);
if (!RetEnd)
return Dst;
@@ -3432,10 +3415,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
return nullptr; // we found a format specifier, bail out.
// sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
- B.CreateMemCpy(
- Dest, Align(1), CI->getArgOperand(1), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
+ B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(1), Align(1),
+ // Copy the null byte.
+ TLI->getAsSizeT(FormatStr.size() + 1, *CI->getModule()));
return ConstantInt::get(CI->getType(), FormatStr.size());
}
@@ -3470,9 +3452,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
uint64_t SrcLen = GetStringLength(CI->getArgOperand(2));
if (SrcLen) {
- B.CreateMemCpy(
- Dest, Align(1), CI->getArgOperand(2), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), SrcLen));
+ B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(2), Align(1),
+ TLI->getAsSizeT(SrcLen, *CI->getModule()));
// Returns total number of characters written without null-character.
return ConstantInt::get(CI->getType(), SrcLen - 1);
} else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) {
@@ -3570,11 +3551,8 @@ Value *LibCallSimplifier::emitSnPrintfMemCpy(CallInst *CI, Value *StrArg,
Value *DstArg = CI->getArgOperand(0);
if (NCopy && StrArg)
// Transform the call to lvm.memcpy(dst, fmt, N).
- copyFlags(
- *CI,
- B.CreateMemCpy(
- DstArg, Align(1), StrArg, Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), NCopy)));
+ copyFlags(*CI, B.CreateMemCpy(DstArg, Align(1), StrArg, Align(1),
+ TLI->getAsSizeT(NCopy, *CI->getModule())));
if (N > Str.size())
// Return early when the whole format string, including the final nul,
@@ -3690,11 +3668,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
if (FormatStr.contains('%'))
return nullptr; // We found a format specifier.
- unsigned SizeTBits = TLI->getSizeTSize(*CI->getModule());
- Type *SizeTTy = IntegerType::get(CI->getContext(), SizeTBits);
return copyFlags(
*CI, emitFWrite(CI->getArgOperand(1),
- ConstantInt::get(SizeTTy, FormatStr.size()),
+ TLI->getAsSizeT(FormatStr.size(), *CI->getModule()),
CI->getArgOperand(0), B, DL, TLI));
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f1568781252c..cb0b4641b649 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -666,7 +666,6 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
// Check whether we are able to set up outer loop induction.
if (!setupOuterLoopInductions()) {
reportVectorizationFailure("Unsupported outer loop Phi(s)",
- "Unsupported outer loop Phi(s)",
"UnsupportedPhi", ORE, TheLoop);
if (DoExtraAnalysis)
Result = false;
@@ -927,7 +926,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
auto *SE = PSE.getSE();
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
- if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
TheLoop)) {
reportVectorizationFailure("Found unvectorizable intrinsic",
@@ -962,7 +961,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
reportVectorizationFailure("Store instruction cannot be vectorized",
- "store instruction cannot be vectorized",
"CantVectorizeStore", ORE, TheLoop, ST);
return false;
}
@@ -976,7 +974,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
reportVectorizationFailure(
"nontemporal store instruction cannot be vectorized",
- "nontemporal store instruction cannot be vectorized",
"CantVectorizeNontemporalStore", ORE, TheLoop, ST);
return false;
}
@@ -991,7 +988,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
reportVectorizationFailure(
"nontemporal load instruction cannot be vectorized",
- "nontemporal load instruction cannot be vectorized",
"CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
return false;
}
@@ -1020,7 +1016,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
reportVectorizationFailure("Value cannot be used outside the loop",
- "value cannot be used outside the loop",
"ValueUsedOutsideLoop", ORE, TheLoop, &I);
return false;
}
@@ -1375,6 +1370,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+ // When vectorizing early exits, create predicates for the latch block only.
+ // The early exiting block must be a direct predecessor of the latch at the
+ // moment.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ if (hasUncountableEarlyExit()) {
+ assert(
+ is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
+ "Uncountable exiting block must be a direct predecessor of latch");
+ return BB == Latch;
+ }
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1432,9 +1437,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion) {
reportVectorizationFailure("If-conversion is disabled",
- "if-conversion is disabled",
- "IfConversionDisabled",
- ORE, TheLoop);
+ "IfConversionDisabled", ORE, TheLoop);
return false;
}
@@ -1483,14 +1486,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (isa<SwitchInst>(BB->getTerminator())) {
if (TheLoop->isLoopExiting(BB)) {
reportVectorizationFailure("Loop contains an unsupported switch",
- "loop contains an unsupported switch",
"LoopContainsUnsupportedSwitch", ORE,
TheLoop, BB->getTerminator());
return false;
}
} else if (!isa<BranchInst>(BB->getTerminator())) {
reportVectorizationFailure("Loop contains an unsupported terminator",
- "loop contains an unsupported terminator",
"LoopContainsUnsupportedTerminator", ORE,
TheLoop, BB->getTerminator());
return false;
@@ -1500,8 +1501,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (blockNeedsPredication(BB) &&
!blockCanBePredicated(BB, SafePointers, MaskedOp)) {
reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select", "NoCFGForSelect",
+ "Control flow cannot be substituted for a select", "NoCFGForSelect",
ORE, TheLoop, BB->getTerminator());
return false;
}
@@ -1691,8 +1691,6 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
} else if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
- "Early exit loop contains operations that "
- "cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
TheLoop);
return false;
@@ -1754,9 +1752,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
if (!canVectorizeOuterLoop()) {
reportVectorizationFailure("Unsupported outer loop",
- "unsupported outer loop",
- "UnsupportedOuterLoop",
- ORE, TheLoop);
+ "UnsupportedOuterLoop", ORE, TheLoop);
// TODO: Implement DoExtraAnalysis when subsequent legal checks support
// outer loops.
return false;
@@ -1788,13 +1784,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
HasUncountableEarlyExit = false;
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+ HasUncountableEarlyExit = true;
if (!isVectorizableEarlyExitLoop()) {
+ UncountableExitingBlocks.clear();
+ HasUncountableEarlyExit = false;
if (DoExtraAnalysis)
Result = false;
else
return false;
- } else
- HasUncountableEarlyExit = true;
+ }
}
// Go over each instruction and look at memory deps.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index fbcf181a45a6..26a2de8c8097 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -222,21 +222,24 @@ public:
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
- return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+ return tryInsertInstruction(
+ new VPInstruction(Ptr, Offset, GEPNoWrapFlags::none(), DL, Name));
}
VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
- return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
+ return tryInsertInstruction(
+ new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name));
}
+ /// Convert the input value \p Current to the corresponding value of an
+ /// induction with \p Start and \p Step values, using \p Start + \p Current *
+ /// \p Step.
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV,
- VPValue *Step, const Twine &Name = "") {
+ VPValue *Current, VPValue *Step,
+ const Twine &Name = "") {
return tryInsertInstruction(
- new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name));
+ new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name));
}
VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c7c044a0427..f2f8a85b7cc2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
cl::Hidden,
cl::desc("Try wider VFs if they enable the use of vector variants"));
+static cl::opt<bool> EnableEarlyExitVectorization(
+ "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Enable vectorization of early exit loops with uncountable exits."));
+
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -474,7 +479,8 @@ public:
AC(AC), ORE(ORE), VF(VecWidth),
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -487,12 +493,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop and the start value for the canonical induction, if it is != 0. The
- /// latter is the case when vectorizing the epilogue loop. In the case of
- /// epilogue vectorization, this function is overriden to handle the more
- /// complex control flow around the loops. \p ExpandedSCEVs is used to
- /// look up SCEV expansions for expressions needed during skeleton creation.
- virtual std::pair<BasicBlock *, Value *>
+ /// loop. In the case of epilogue vectorization, this function is overridden to
+ /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
+ /// used to look up SCEV expansions for expressions needed during skeleton
+ /// creation.
+ virtual BasicBlock *
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -513,18 +518,6 @@ public:
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPTransformState &State);
- /// Create a new phi node for the induction variable \p OrigPhi to resume
- /// iteration count in the scalar epilogue, from where the vectorized loop
- /// left off. \p Step is the SCEV-expanded induction step to use. In cases
- /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
- /// and the resume values can come from an additional bypass block, the \p
- /// AdditionalBypass pair provides information about the bypass block and the
- /// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
-
/// Returns the original loop trip count.
Value *getTripCount() const { return TripCount; }
@@ -533,6 +526,20 @@ public:
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ /// Retrieve the additional bypass value associated with an original
+ /// induction header phi.
+ Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
+ return Induction2AdditionalBypassValue.at(OrigPhi);
+ }
+
+ /// Return the additional bypass block which targets the scalar loop by
+ /// skipping the epilogue loop after completing the main loop.
+ BasicBlock *getAdditionalBypassBlock() const {
+ assert(AdditionalBypassBlock &&
+ "Trying to access AdditionalBypassBlock but it has not been set");
+ return AdditionalBypassBlock;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -566,21 +573,21 @@ protected:
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);
- /// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off.
- /// In cases where the loop skeleton is more complicated (eg. epilogue
- /// vectorization) and the resume values can come from an additional bypass
- /// block, the \p AdditionalBypass pair provides information about the bypass
- /// block and the end value on the edge from bypass to this loop.
- void createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// Create and record the values for induction variables to resume coming from
+ /// the additional bypass block.
+ void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart() {}
virtual void printDebugTracesAtEnd() {}
+ /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+ /// vector preheader and its predecessor, also connecting the new block to the
+ /// scalar preheader.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
/// The original loop.
Loop *OrigLoop;
@@ -664,7 +671,21 @@ protected:
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+ /// Mapping of induction phis to their additional bypass values. They
+ /// need to be added as operands to phi nodes in the scalar loop preheader
+ /// after the epilogue skeleton has been created.
+ DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
+
+ /// The additional bypass block which conditionally skips over the epilogue
+ /// loop after executing the main loop. Needed to resume inductions and
+ /// reductions during epilogue vectorization.
+ BasicBlock *AdditionalBypassBlock = nullptr;
+
VPlan &Plan;
+
+ /// The vector preheader block of \p Plan, used as target for check blocks
+ /// introduced during skeleton creation.
+ VPBlockBase *VectorPHVPB;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -681,10 +702,13 @@ struct EpilogueLoopVectorizationInfo {
BasicBlock *MemSafetyCheck = nullptr;
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
+ VPlan &EpiloguePlan;
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
- ElementCount EVF, unsigned EUF)
- : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
+ ElementCount EVF, unsigned EUF,
+ VPlan &EpiloguePlan)
+ : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
+ EpiloguePlan(EpiloguePlan) { // Binds the VPlan reference member; the plan itself is not copied.
assert(EUF == 1 && // Only an epilogue unroll factor of 1 is accepted.
"A high UF for the epilogue loop is likely not beneficial.");
}
@@ -714,15 +738,15 @@ public:
// Override this function to handle the more complex control flow around the
// three loops. The work is delegated to createEpilogueVectorizedLoopSkeleton().
- std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
- const SCEV2ValueTy &ExpandedSCEVs) final {
+ BasicBlock *
+ createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); // Result is forwarded unchanged.
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
- virtual std::pair<BasicBlock *, Value *>
+ virtual BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
/// Holds and updates state information required to vectorize the main loop
@@ -751,7 +775,7 @@ public:
EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- std::pair<BasicBlock *, Value *>
+ BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
@@ -786,7 +810,7 @@ public:
}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- std::pair<BasicBlock *, Value *>
+ BasicBlock *
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
@@ -1214,8 +1238,8 @@ public:
return false;
// Get the source and destination types of the truncate.
- Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
- Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+ Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
// If the truncate is free for the given types, return false. Replacing a
// free truncate with an induction variable would add an induction variable
@@ -1350,9 +1374,10 @@ public:
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
- // If we might exit from anywhere but the latch, must run the exiting
- // iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ // If we might exit from anywhere but the latch and early exit vectorization
+ // is disabled, we must run the exiting iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
"from latch block\n");
return true;
@@ -1706,7 +1731,8 @@ private:
bool needsExtract(Value *V, ElementCount VF) const {
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
- TheLoop->isLoopInvariant(I))
+ TheLoop->isLoopInvariant(I) ||
+ getWideningDecision(I, VF) == CM_Scalarize)
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -2428,6 +2454,21 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
return VectorTripCount;
}
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { // Links a check block in front of the vector preheader to the scalar preheader in Plan.
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
+ if (PreVectorPH->getNumSuccessors() != 1) { // Predecessor already has two successors: a new block wrapping CheckIRBB is required.
+ assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
+ assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
+ "Unexpected successor");
+ VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); // New block sits between the old predecessor and the vector preheader.
+ PreVectorPH = CheckVPIRBB;
+ } // Otherwise the single-successor predecessor is reused and CheckIRBB is not wrapped here.
+ VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
+ PreVectorPH->swapSuccessors(); // NOTE(review): presumably makes ScalarPH the first successor, matching the assert above - confirm.
+}
+
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
@@ -2502,14 +2543,15 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass & LoopExit (if needed).
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
+
+ // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
+ introduceCheckBlockInVPlan(TCCheckBlock);
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -2526,6 +2568,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
"Should already be a bypass block due to iteration count check");
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
+
+ introduceCheckBlockInVPlan(SCEVCheckBlock);
return SCEVCheckBlock;
}
@@ -2562,80 +2606,40 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
AddedSafetyChecks = true;
+ introduceCheckBlockInVPlan(MemCheckBlock);
return MemCheckBlock;
}
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. Asserts that none of the moved recipes is a phi.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+ VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); // The new block is created in the plan that owns VPBB.
+ for (auto &R : make_early_inc_range(*VPBB)) { // Early-increment iteration, since R is moved out of VPBB inside the loop.
+ assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+ R.moveBefore(*IRVPBB, IRVPBB->end()); // Appending one by one keeps the recipes in their original order.
+ }
+
+ VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+ // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
+
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { // Splits the preheader to create the middle block and scalar preheader, and wraps both in the VPlan.
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
- assert((OrigLoop->getUniqueExitBlock() ||
+ assert((OrigLoop->getUniqueLatchExitBlock() ||
Cost->requiresScalarEpilogue(VF.isVector())) &&
- "multiple exit loop without required epilogue?");
+ "loops not exiting via the latch without required epilogue?");
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, Twine(Prefix) + "middle.block");
+ replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); // Swap the plan's middle block for one wrapping the new IR block.
LoopScalarPreHeader =
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
-}
-
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- assert(VectorTripCount && "Expected valid arguments");
-
- Instruction *OldInduction = Legal->getPrimaryInduction();
- Value *EndValue = nullptr;
- Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
- EndValue = VectorTripCount;
- } else {
- IRBuilder<> B(LoopVectorPreHeader->getTerminator());
-
- // Fast-math-flags propagate from the original induction instruction.
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
- EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
- EndValue->setName("ind.end");
-
- // Compute the end value for the additional bypass (if applicable).
- if (AdditionalBypass.first) {
- B.SetInsertPoint(AdditionalBypass.first,
- AdditionalBypass.first->getFirstInsertionPt());
- EndValueFromAdditionalBypass =
- emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
- EndValueFromAdditionalBypass->setName("ind.end");
- }
- }
-
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
-
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
-
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
-
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); // Same for the plan's scalar preheader.
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2652,31 +2656,66 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
return I->second;
}
-void InnerLoopVectorizer::createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(((AdditionalBypass.first && AdditionalBypass.second) ||
- (!AdditionalBypass.first && !AdditionalBypass.second)) &&
- "Inconsistent information about additional bypass.");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
+/// Knowing that loop \p L executes a single vector iteration, add instructions
+/// that will get simplified and thus should not have any cost to \p
+/// InstsToIgnore. These are the latch compare and the induction updates of \p IL used only by their phi or that compare.
+static void addFullyUnrolledInstructionsToIgnore(
+ Loop *L, const LoopVectorizationLegality::InductionList &IL,
+ SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
+ auto *Cmp = L->getLatchCmpInst();
+ if (Cmp) // Cmp may be null; only a non-null latch compare is recorded.
+ InstsToIgnore.insert(Cmp);
+ for (const auto &KV : IL) {
+ // Extract the key by hand so that it can be used in the lambda below. Note
+ // that captured structured bindings are a C++20 extension.
+ const PHINode *IV = KV.first;
+
+ // Get next iteration value of the induction variable.
+ Instruction *IVInst =
+ cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
+ if (all_of(IVInst->users(), // Ignore the update only if nothing else consumes it.
+ [&](const User *U) { return U == IV || U == Cmp; }))
+ InstsToIgnore.insert(IVInst);
+ }
+}
+
+/// Compute, for every induction variable of the original loop, the value it
+/// resumes with when the scalar loop is entered from the additional bypass
+/// block, and record it in Induction2AdditionalBypassValue.
+///
+/// \p ExpandedSCEVs is used to look up the expanded step of each induction.
+/// \p MainVectorTripCount must be non-null; it is used directly as the bypass
+/// value of the primary induction and as the index from which the bypass
+/// values of all other inductions are computed.
+void InnerLoopVectorizer::createInductionAdditionalBypassValues(
+ const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
+ assert(MainVectorTripCount && "Must have bypass information");
+
+ Instruction *OldInduction = Legal->getPrimaryInduction();
+ // Emit all bypass values at the first insertion point of the additional
+ // bypass block.
+ IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
+ getAdditionalBypassBlock()->getFirstInsertionPt());
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ Value *Step = getExpandedStep(II, ExpandedSCEVs);
+ // For the primary induction the additional bypass end value is known.
+ // Otherwise it is computed.
+ Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+ if (OrigPhi != OldInduction) {
+ auto *BinOp = II.getInductionBinOp();
+ // Fast-math-flags propagate from the original induction instruction.
+ if (isa_and_nonnull<FPMathOperator>(BinOp))
+ BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+ // Compute the end value for the additional bypass.
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+ II.getStartValue(), Step, II.getKind(), BinOp);
+ EndValueFromAdditionalBypass->setName("ind.end");
+ }
+
+ // Store the bypass value here, as it needs to be added as operand to its
+ // scalar preheader phi node after the epilogue skeleton has been created.
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
+ "entry for OrigPhi already exists");
+ Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
}
}
-std::pair<BasicBlock *, Value *>
-InnerLoopVectorizer::createVectorizedLoopSkeleton(
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
/*
In this function we generate a new loop. The new loop will contain
@@ -2733,10 +2772,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// faster.
emitMemRuntimeChecks(LoopScalarPreHeader);
- // Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(ExpandedSCEVs);
-
- return {LoopVectorPreHeader, nullptr};
+ return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
@@ -2753,8 +2789,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
DenseMap<Value *, Value *> MissingVals;
Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
@@ -2808,6 +2842,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
}
+ assert((MissingVals.empty() ||
+ all_of(MissingVals,
+ [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
+ return all_of(
+ predecessors(cast<Instruction>(P.first)->getParent()),
+ [MiddleBlock, this](BasicBlock *Pred) {
+ return Pred == MiddleBlock ||
+ Pred == OrigLoop->getLoopLatch();
+ });
+ })) &&
+ "Expected escaping values from latch/middle.block only");
+
for (auto &I : MissingVals) {
PHINode *PHI = cast<PHINode>(I.first);
// One corner case we have to handle is two IVs "chasing" each-other,
@@ -3411,14 +3457,14 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
}
InstructionCost SafeDivisorCost = 0;
- auto *VecTy = ToVectorTy(I->getType(), VF);
+ auto *VecTy = toVectorTy(I->getType(), VF);
// The cost of the select guard to ensure all lanes are well defined
// after we speculate above any internal control flow.
- SafeDivisorCost += TTI.getCmpSelInstrCost(
- Instruction::Select, VecTy,
- ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ SafeDivisorCost +=
+ TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
+ toVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
@@ -3585,10 +3631,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Start with the conditional branches exiting the loop. If the branch
// condition is an instruction contained in the loop that is only used by the
- // branch, it is uniform.
+ // branch, it is uniform. Note conditions from uncountable early exits are not
+ // uniform.
SmallVector<BasicBlock *> Exiting;
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
+ if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
+ continue;
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
AddToWorklistIfAllowed(Cmp);
@@ -4147,7 +4196,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (TC == 0) {
reportVectorizationFailure(
- "Unable to calculate the loop count due to complex control flow",
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
return FixedScalableVFPair::getNone();
@@ -4536,7 +4584,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
}
auto WillWiden = [&TTI, VF](Type *ScalarTy) {
- Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ Type *VectorTy = toVectorTy(ScalarTy, VF);
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
if (!NumLegalParts)
return false;
@@ -4673,6 +4721,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
+ // TODO: Add support for loops with an early exit.
if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
@@ -4921,6 +4970,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;
+ // We don't attempt to perform interleaving for loops with uncountable early
+ // exits because the VPInstruction::AnyOf code cannot currently handle
+ // multiple parts.
+ if (Legal->hasUncountableEarlyExit())
+ return 1;
+
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
@@ -5105,8 +5160,9 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
HasReductions &&
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return RecurrenceDescriptor::isAnyOfRecurrenceKind(
- RdxDesc.getRecurrenceKind());
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
});
if (HasSelectCmpReductions) {
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
@@ -5519,7 +5575,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(I->getType(), VF)),
+ cast<VectorType>(toVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
/*Extract*/ false, CostKind);
ScalarCost +=
@@ -5538,7 +5594,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Worklist.push_back(J);
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ cast<VectorType>(toVectorTy(J->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
/*Extract*/ true, CostKind);
}
@@ -5559,6 +5615,15 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;
+ // If the vector loop gets executed exactly once with the given VF, ignore the
+ // costs of comparison and induction instructions, as they'll get simplified
+ // away.
+ SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
+ auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
+ addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
+ ValuesToIgnoreForVF);
+
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
InstructionCost BlockCost;
@@ -5566,7 +5631,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.count(&I) ||
+ if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
@@ -5640,7 +5705,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ Type *PtrTy = toVectorTy(Ptr->getType(), VF);
// NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
// that it is being called from this specific place.
@@ -5691,7 +5756,7 @@ InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
@@ -5723,7 +5788,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
assert(Legal->isUniformMemOp(*I, VF));
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -5749,7 +5814,7 @@ InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getLoadStoreType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
const Value *Ptr = getLoadStorePointerOperand(I);
@@ -5767,7 +5832,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Instruction *InsertPos = Group->getInsertPos();
Type *ValTy = getLoadStoreType(InsertPos);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -6012,7 +6077,7 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
return 0;
InstructionCost Cost = 0;
- Type *RetTy = ToVectorTy(I->getType(), VF);
+ Type *RetTy = toVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
@@ -6278,9 +6343,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
bool MaskRequired = Legal->isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ Type *RetTy = toVectorTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
+ Tys.push_back(toVectorTy(ScalarTy, VF));
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
@@ -6470,7 +6535,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
HasSingleCopyAfterVectorization(I, VF));
VectorTy = RetTy;
} else
- VectorTy = ToVectorTy(RetTy, VF);
+ VectorTy = toVectorTy(RetTy, VF);
if (VF.isVector() && VectorTy->isVectorTy() &&
!TTI.getNumberOfParts(VectorTy))
@@ -6530,8 +6595,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return Switch->getNumCases() *
TTI.getCmpSelInstrCost(
Instruction::ICmp,
- ToVectorTy(Switch->getCondition()->getType(), VF),
- ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ toVectorTy(Switch->getCondition()->getType(), VF),
+ toVectorTy(Type::getInt1Ty(I->getContext()), VF),
CmpInst::ICMP_EQ, CostKind);
}
case Instruction::PHI: {
@@ -6576,8 +6641,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
- Instruction::Select, ToVectorTy(ResultTy, VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ Instruction::Select, toVectorTy(ResultTy, VF),
+ toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
@@ -6586,8 +6651,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (VF.isVector() && foldTailWithEVL() &&
Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
IntrinsicCostAttributes ICA(
- Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF),
- {ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
+ Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
+ {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
return TTI.getIntrinsicInstrCost(ICA, CostKind);
}
@@ -6727,7 +6792,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
}
- VectorTy = ToVectorTy(ValTy, VF);
+ VectorTy = toVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
@@ -6745,7 +6810,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
}
- VectorTy = ToVectorTy(getLoadStoreType(I), Width);
+ VectorTy = toVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
}
case Instruction::BitCast:
@@ -6826,7 +6891,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
SrcScalarTy =
IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
- VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
if (canTruncateToMinimalBitwidth(I, VF)) {
// If the result type is <= the source type, there will be no extend
@@ -7248,6 +7313,17 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
continue;
IVInsts.push_back(CI);
}
+
+ // If the vector loop gets executed exactly once with the given VF, ignore
+ // the costs of comparison and induction instructions, as they'll get
+ // simplified away.
+ // TODO: Remove this code after stepping away from the legacy cost model and
+ // adding code to simplify VPlans before calculating their costs.
+ auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
+ if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
+ addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
+ CostCtx.SkipCostComputation);
+
for (Instruction *IVInst : IVInsts) {
if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
continue;
@@ -7344,7 +7420,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
auto ReductionCost = CM.getReductionPatternCost(
- I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
if (!ReductionCost)
continue;
@@ -7584,7 +7660,8 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
// fix the reduction's scalar PHI node by adding the incoming value from the
// main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) {
+ VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
+ BasicBlock *BypassBlock) {
auto *EpiRedResult = dyn_cast<VPInstruction>(R);
if (!EpiRedResult ||
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
@@ -7621,21 +7698,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
auto *EpiResumePhiVPI =
cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
- BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();
- bool Updated = false;
- for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
- if (is_contained(MainResumePhi->blocks(), Incoming)) {
- assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
- RdxDesc.getRecurrenceStartValue() &&
- "Trying to reset unexpected value");
- assert(!Updated && "Should update at most 1 incoming value");
- EpiResumePhi->setIncomingValueForBlock(
- Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
- Updated = true;
- }
- }
- assert(Updated && "Must update EpiResumePhi.");
- (void)Updated;
+ EpiResumePhi->setIncomingValueForBlock(
+ BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7656,23 +7720,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
OrigLoop->getHeader()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
-
- LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
- << ", UF=" << BestUF << '\n');
- BestVPlan.setName("Final VPlan");
- LLVM_DEBUG(BestVPlan.dump());
+ VPlanTransforms::convertToConcreteRecipes(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
- &BestVPlan);
+ &BestVPlan, OrigLoop->getParentLoop(),
+ Legal->getWidestInductionType());
+
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+ // 0. Generate SCEV-dependent code in the entry, including TripCount, before
+ // making any changes to the CFG.
+ if (!BestVPlan.getEntry()->empty())
+ BestVPlan.getEntry()->execute(&State);
- // 0. Generate SCEV-dependent code into the preheader, including TripCount,
- // before making any changes to the CFG.
- if (!BestVPlan.getPreheader()->empty()) {
- State.CFG.PrevBB = OrigLoop->getLoopPreheader();
- State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
- BestVPlan.getPreheader()->execute(&State);
- }
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
else
@@ -7681,13 +7744,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- Value *CanonicalIVStartValue;
- std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
- ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
- : State.ExpandedSCEVs);
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
+ ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
+ if (VectorizingEpilogue)
+ VPlanTransforms::removeDeadRecipes(BestVPlan);
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
@@ -7718,20 +7778,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(ILV.getTripCount(),
- ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State);
- VPlanTransforms::prepareToExecute(BestVPlan);
+ BestVPlan.prepareToExecute(
+ ILV.getTripCount(),
+ ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+ replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB);
BestVPlan.execute(&State);
- // 2.5 Collect reduction resume values.
auto *ExitVPBB = BestVPlan.getMiddleBlock();
- if (VectorizingEpilogue)
+ // 2.5 When vectorizing the epilogue, fix reduction and induction resume
+ // values from the additional bypass block.
+ if (VectorizingEpilogue) {
+ assert(!ILV.Legal->hasUncountableEarlyExit() &&
+ "Epilogue vectorisation not yet supported with early exits");
+ BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
- &R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
+ &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
+ }
+ BasicBlock *PH = OrigLoop->getLoopPreheader();
+ for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BypassBlock, V);
}
+ }
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7758,7 +7829,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
}
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+ if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
addRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
@@ -7788,8 +7859,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-std::pair<BasicBlock *, Value *>
-EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("");
@@ -7820,12 +7890,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// Generate the induction variable.
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
- // Skip induction resume value creation here because they will be created in
- // the second pass for the scalar loop. The induction resume values for the
- // inductions in the epilogue loop are created before executing the plan for
- // the epilogue loop.
-
- return {LoopVectorPreHeader, nullptr};
+ return LoopVectorPreHeader;
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7880,8 +7945,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass.
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
LoopBypassBlocks.push_back(TCCheckBlock);
// Save the trip count so we don't have to regenerate it in the
@@ -7896,6 +7959,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+ introduceCheckBlockInVPlan(TCCheckBlock);
return TCCheckBlock;
}
@@ -7905,7 +7969,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-std::pair<BasicBlock *, Value *>
+BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("vec.epilog.");
@@ -7918,6 +7982,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
nullptr, "vec.epilog.iter.check", true);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
+ AdditionalBypassBlock = VecEpilogueIterationCountCheck;
// Adjust the control flow taking the state info from the main loop
// vectorization into account.
@@ -7926,9 +7991,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopVectorPreHeader);
- DT->changeImmediateDominator(LoopVectorPreHeader,
- EPI.MainLoopIterationCountCheck);
-
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
@@ -7939,19 +8001,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
- DT->changeImmediateDominator(
- VecEpilogueIterationCountCheck,
- VecEpilogueIterationCountCheck->getSinglePredecessor());
-
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
- // If there is an epilogue which must run, there's no edge from the
- // middle block to exit blocks and thus no need to update the immediate
- // dominator of the exit blocks.
- DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(),
- EPI.EpilogueIterationCountCheck);
-
// Keep track of bypass blocks, as they feed start values to the induction and
// reduction phis in the scalar loop preheader.
if (EPI.SCEVSafetyCheck)
@@ -7988,27 +8039,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
Phi->removeIncomingValue(EPI.MemSafetyCheck);
}
- // Generate a resume induction for the vector epilogue and put it in the
- // vector epilogue preheader
- Type *IdxTy = Legal->getWidestInductionType();
- PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
- EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
- EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
- EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
- EPI.MainLoopIterationCountCheck);
-
- // Generate induction resume values. These variables save the new starting
- // indexes for the scalar loop. They are used to test if there are any tail
- // iterations left once the vector loop has completed.
- // Note that when the vectorized epilogue is skipped due to iteration count
- // check, then the resume value for the induction variable comes from
- // the trip count of the main vector loop, hence passing the AdditionalBypass
- // argument.
- createInductionResumeValues(ExpandedSCEVs,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
-
- return {LoopVectorPreHeader, EPResumeVal};
+ // Generate bypass values from the additional bypass block. Note that when the
+ // vectorized epilogue is skipped due to iteration count check, then the
+ // resume value for the induction variable comes from the trip count of the
+ // main vector loop, passed as the second argument.
+ createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
+ return LoopVectorPreHeader;
}
BasicBlock *
@@ -8054,6 +8090,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);
LoopBypassBlocks.push_back(Insert);
+
+ // A new entry block has been created for the epilogue VPlan. Hook it in, as
+ // otherwise we would try to modify the entry to the main vector loop.
+ VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
+ VPBasicBlock *OldEntry = Plan.getEntry();
+ VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
+ Plan.setEntry(NewEntry);
+ // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
+
+ introduceCheckBlockInVPlan(Insert);
return Insert;
}
@@ -8160,8 +8206,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
- // adding uses of an otherwise potentially dead instruction.
- if (OrigLoop->isLoopExiting(Src))
+ // adding uses of an otherwise potentially dead instruction unless we are
+ // vectorizing a loop with uncountable exits. In that case, we always
+ // materialize the mask.
+ if (OrigLoop->isLoopExiting(Src) &&
+ Src != Legal->getUncountableEarlyExitingBlock())
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8297,10 +8346,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
if (Reverse)
VectorPtr = new VPReverseVectorPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I),
- GEP ? GEP->isInBounds() : false, I->getDebugLoc());
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ I->getDebugLoc());
else
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
- GEP ? GEP->isInBounds() : false,
+ GEP ? GEP->getNoWrapFlags()
+ : GEPNoWrapFlags::none(),
I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
@@ -8329,11 +8381,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, TruncI);
+ IndDesc, TruncI,
+ TruncI->getDebugLoc());
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc);
+ IndDesc, Phi->getDebugLoc());
}
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
@@ -8355,7 +8408,8 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
[&](ElementCount VF) {
return CM.isScalarAfterVectorization(Phi, VF);
},
- Range));
+ Range),
+ Phi->getDebugLoc());
}
return nullptr;
}
@@ -8809,14 +8863,55 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
-/// Create resume phis in the scalar preheader for first-order recurrences and
-/// reductions and update the VPIRInstructions wrapping the original phis in the
-/// scalar header.
+/// Create and return a ResumePhi ("bc.resume.val") for \p WideIV using \p ScalarPHBuilder, unless it is truncated (then nullptr is returned). If the
+/// induction recipe is not a canonical int/FP induction, creates a derived IV via \p VectorPHBuilder from its start, \p VectorTC and step to compute
+/// the end value of the induction; otherwise \p VectorTC itself is used as end value. \p TypeInfo is used to detect when the end value must be truncated.
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+ VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder,
+ VPTypeAnalysis &TypeInfo,
+ VPValue *VectorTC) {
+ auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); // Null for any other kind of wide induction.
+ // Truncated wide inductions resume from the last lane of their vector value
+ // in the last vector iteration, which is handled elsewhere.
+ if (WideIntOrFp && WideIntOrFp->getTruncInst())
+ return nullptr;
+
+ VPValue *Start = WideIV->getStartValue();
+ VPValue *Step = WideIV->getStepValue();
+ const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+ VPValue *EndValue = VectorTC; // Default: resume directly from the vector trip count.
+ if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { // Everything but a canonical int/FP induction needs a derived end value.
+ EndValue = VectorPHBuilder.createDerivedIV(
+ ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+ Start, VectorTC, Step);
+ }
+
+ // EndValue is derived from the vector trip count (which has the same type as
+ // the widest induction) and thus may be wider than the induction here.
+ Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+ EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+ ScalarTypeOfWideIV);
+ }
+
+ auto *ResumePhiRecipe = // Operands are {end value, start value}, in that order.
+ ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
+ WideIV->getDebugLoc(), "bc.resume.val");
+ return ResumePhiRecipe;
+}
+
+/// Create resume phis in the scalar preheader for first-order recurrences,
+/// reductions and inductions, and update the VPIRInstructions wrapping the
+/// original phis in the scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
- VPBuilder ScalarPHBuilder(ScalarPH);
+ VPBuilder VectorPHBuilder(
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPBuilder ScalarPHBuilder(ScalarPH);
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
@@ -8824,9 +8919,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
if (!ScalarPhiI)
break;
+
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
- if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
+ if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+ WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+ &Plan.getVectorTripCount())) {
+ ScalarPhiIRI->addOperand(ResumePhi);
+ continue;
+ }
+ // TODO: Also handle truncated inductions here. Computing end-values
+ // separately should be done as VPlan-to-VPlan optimization, after
+ // legalizing all resume values to use the last lane from the loop.
+ assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+ "should only skip truncated wide inductions");
continue;
+ }
+
// The backedge value provides the value to resume coming out of a loop,
// which for FORs is a vector whose last element needs to be extracted. The
// start value provides the value if the loop is bypassed.
@@ -8852,14 +8961,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
- BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
- BasicBlock *ExitingBB = find_singleton<BasicBlock>(
- to_vector(predecessors(ExitBB)),
- [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
- return OrigLoop->contains(Pred) ? Pred : nullptr;
- });
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
@@ -8867,35 +8971,48 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
if (!ExitPhi)
break;
- Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
- VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
- // Exit values for inductions are computed and updated outside of VPlan
- // and independent of induction recipes.
- // TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- })))
- continue;
- ExitUsersToFix.insert(ExitIRI);
- ExitIRI->addOperand(V);
+ for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
+ BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
+ if (PredVPBB != MiddleVPBB) {
+ SmallVector<BasicBlock *> ExitingBlocks;
+ OrigLoop->getExitingBlocks(ExitingBlocks);
+ assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
+ ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
+ : ExitingBlocks[0];
+ }
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+ // Exit values for inductions are computed and updated outside of VPlan
+ // and independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V) ||
+ (isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+ any_of(IncomingValue->users(), [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ }))) {
+ if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+ continue;
+ }
+ ExitUsersToFix.insert(ExitIRI);
+ ExitIRI->addOperand(V);
+ }
}
}
return ExitUsersToFix;
}
// Add exit values to \p Plan. Extracts are added for each entry in \p
-// ExitUsersToFix if needed and their operands are updated.
-static void
+// ExitUsersToFix if needed and their operands are updated. Returns true if all
+// exit users can be handled, otherwise return false.
+static bool
addUsersInExitBlocks(VPlan &Plan,
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
if (ExitUsersToFix.empty())
- return;
+ return true;
auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
@@ -8903,20 +9020,25 @@ addUsersInExitBlocks(VPlan &Plan,
// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
- VPValue *V = ExitIRI->getOperand(0);
- // Pass live-in values used by exit phis directly through to their users in
- // the exit block.
- if (V->isLiveIn())
- continue;
+ for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
+ // Pass live-in values used by exit phis directly through to their users
+ // in the exit block.
+ if (Op->isLiveIn())
+ continue;
+
+ // Currently only live-ins can be used by exit values from blocks not
+ // exiting via the vector latch through to the middle block.
+ if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
+ return false;
- assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
- "Exit value not handled yet for this edge.");
- LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
- VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
- {V, Plan.getOrAddLiveIn(ConstantInt::get(
- IntegerType::get(Ctx, 32), 1))});
- ExitIRI->setOperand(0, Ext);
+ LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
+ VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
+ {Op, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(Ctx, 32), 1))});
+ ExitIRI->setOperand(Idx, Ext);
+ }
}
+ return true;
}
/// Handle users in the exit block for first order reductions in the original
@@ -9176,7 +9298,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
- VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+ VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
@@ -9189,11 +9311,22 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ if (auto *UncountableExitingBlock =
+ Legal->getUncountableEarlyExitingBlock()) {
+ VPlanTransforms::handleUncountableEarlyExit(
+ *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
+ }
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlocks(*Plan, ExitUsersToFix);
+ if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
+ reportVectorizationFailure(
+ "Some exit values in loop with uncountable exit not supported yet",
+ "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
+ return nullptr;
+ }
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
@@ -9304,6 +9437,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
+
+ // Collect mapping of IR header phis to header phi recipes, to be used in
+ // addScalarResumePhis.
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+ for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
+ RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
+ }
+ addScalarResumePhis(RecipeBuilder, *Plan);
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9334,8 +9479,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
- assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
- "AnyOf reductions are not allowed for in-loop reductions");
+ assert(
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
+ "AnyOf and FindLast reductions are not allowed for in-loop reductions");
// Collect the chain of "link" recipes for the reduction starting at PhiR.
SetVector<VPSingleDefRecipe *> Worklist;
@@ -9439,9 +9586,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB))
CondOp = RecipeBuilder.getBlockInMask(BB);
- VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
- CondOp, CM.useOrderedReductions(RdxDesc));
+ auto *RedRecipe = new VPReductionRecipe(
+ RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
+ CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of its inputs, including CondOp.
// Note that this transformation may leave over dead recipes (including
@@ -9566,6 +9713,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// Convert the reduction phi to operate on bools.
PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
OrigLoop->getHeader()->getContext())));
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ // Adjust the start value for FindLastIV recurrences to use the sentinel
+ // value after generating the ResumePhi recipe, which uses the original
+ // start value.
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
}
}
@@ -9581,13 +9737,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPLane(0));
- Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
+ Value *Index = State.get(getOperand(1), VPLane(0));
Value *DerivedIV = emitTransformedIndex(
- State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
- Kind, cast_if_present<BinaryOperator>(FPBinOp));
+ State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
+ cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
- assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
-
+ // If index is the vector trip count, the concrete value will only be set in
+ // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
+ // TODO: Remove the special case for the vector trip count once it is computed
+ // in VPlan and can be used during VPlan simplification.
+ assert((DerivedIV != Index ||
+ getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
+ "IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}
@@ -9897,6 +10058,164 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
!EnableLoopVectorization) {}
+/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
+/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
+/// don't have a corresponding wide induction in \p EpiPlan. Afterwards, make sure the scalar preheader of \p MainPlan has a ResumePhi of (vector trip count, 0), creating "vec.epilog.resume.val" if none exists.
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
+ // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
+ // will need their resume-values computed in the main vector loop. Others
+ // can be removed from the main VPlan.
+ SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
+ for (VPRecipeBase &R :
+ EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R)) // The canonical IV is not collected here.
+ continue;
+ EpiWidenedPhis.insert(
+ cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
+ }
+ for (VPRecipeBase &R : make_early_inc_range( // Early-inc range: recipes are erased while iterating.
+ *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
+ auto *VPIRInst = cast<VPIRInstruction>(&R);
+ auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
+ if (!IRI) // Stop at the first wrapped instruction that is not a phi.
+ break;
+ if (EpiWidenedPhis.contains(IRI)) // Still needed by the epilogue plan; keep its ResumePhi.
+ continue;
+ // There is no corresponding wide induction in the epilogue plan that would
+ // need a resume value. Remove the VPIRInst wrapping the scalar header phi
+ // together with the corresponding ResumePhi. The resume values for the
+ // scalar loop will be created during execution of EpiPlan.
+ VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); // NOTE(review): assumes operand 0 is defined by a recipe (not a live-in) — confirm.
+ VPIRInst->eraseFromParent(); // Erase the user first, then the ResumePhi it used.
+ ResumePhi->eraseFromParent();
+ }
+ VPlanTransforms::removeDeadRecipes(MainPlan);
+
+ using namespace VPlanPatternMatch;
+ VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
+ VPValue *VectorTC = &MainPlan.getVectorTripCount();
+ // If there is a suitable resume value for the canonical induction in the
+ // scalar (which will become vector) epilogue loop, we are done. Otherwise
+ // create it below.
+ if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
+ return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( // Matches ResumePhi(VectorTC, 0).
+ m_Specific(VectorTC), m_SpecificInt(0)));
+ }))
+ return;
+ VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); // Insert at the start of the scalar preheader.
+ ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+ "vec.epilog.resume.val");
+}
+
+/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
+/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Resume values are looked up as incoming values from the preheader of \p L; \p EPI supplies the check blocks and vector trip count of the main loop.
+static void
+preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
+ const SCEV2ValueTy &ExpandedSCEVs,
+ const EpilogueLoopVectorizationInfo &EPI) {
+ VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
+ Header->setName("vec.epilog.vector.body");
+
+ // Re-use the trip count and steps expanded for the main loop, as
+ // skeleton creation needs it as a value that dominates both the scalar
+ // and vector epilogue loops.
+ // TODO: This is a workaround needed for epilogue vectorization and it
+ // should be removed once induction resume value creation is done
+ // directly in VPlan.
+ for (auto &R : make_early_inc_range(*Plan.getEntry())) { // Early-inc range: expand recipes are erased while iterating.
+ auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpandR)
+ continue;
+ auto *ExpandedVal = // NOTE(review): find() result is dereferenced unchecked; assumes the SCEV is present in ExpandedSCEVs.
+ Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+ ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (Plan.getTripCount() == ExpandR) // The plan's trip count is reset explicitly, in addition to the RAUW above.
+ Plan.resetTripCount(ExpandedVal);
+ ExpandR->eraseFromParent();
+ }
+
+ // Ensure that the start values for all header phi recipes are updated before
+ // vectorizing the epilogue loop.
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
+ // When vectorizing the epilogue loop, the canonical induction start
+ // value needs to be changed from zero to the value after the main
+ // vector loop. Find the resume value created during execution of the main
+ // VPlan.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
+ // loop.
+ BasicBlock *MainMiddle = find_singleton<BasicBlock>( // The preheader predecessor that is none of the check blocks recorded in EPI.
+ predecessors(L->getLoopPreheader()),
+ [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
+ if (BB != EPI.MainLoopIterationCountCheck &&
+ BB != EPI.EpilogueIterationCountCheck &&
+ BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
+ return BB;
+ return nullptr;
+ });
+ using namespace llvm::PatternMatch;
+ Type *IdxTy = IV->getScalarType();
+ PHINode *EPResumeVal = find_singleton<PHINode>( // Preheader phi of type IdxTy: VectorTripCount from MainMiddle, 0 from the main iteration-count check.
+ L->getLoopPreheader()->phis(),
+ [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
+ if (P.getType() == IdxTy &&
+ P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
+ match(
+ P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
+ m_SpecificInt(0)))
+ return &P;
+ return nullptr;
+ });
+ assert(EPResumeVal && "must have a resume value for the canonical IV");
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ assert(all_of(IV->users(), // Sanity check on the kinds of users before changing the start value.
+ [](const VPUser *U) {
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPScalarCastRecipe>(U) || // Also accepted, although not mentioned in the message below.
+ isa<VPDerivedIVRecipe>(U) || // Also accepted, although not mentioned in the message below.
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add;
+ }) &&
+ "the canonical IV should only be used by its increment or "
+ "ScalarIVSteps when resetting the start value");
+ IV->setOperand(0, VPV); // Operand 0 is replaced with the resume value found above.
+ continue;
+ }
+
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) // Incoming value of the original reduction phi from L's preheader.
+ ->getIncomingValueForBlock(L->getLoopPreheader());
+ const RecurrenceDescriptor &RdxDesc =
+ ReductionPhi->getRecurrenceDescriptor();
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ IRBuilder<> Builder( // Insert the compare at the first non-phi of the block defining ResumeV.
+ cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+ ResumeV =
+ Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
+ }
+ } else {
+ // Retrieve the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop.
+ PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); // cast<> asserts: any remaining header phi must be a wide induction.
+ // Hook up to the PHINode generated by a ResumePhi recipe of main
+ // loop VPlan, which feeds the scalar loop.
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
+ }
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); // The IR resume value enters the plan as a live-in.
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
+ }
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -9946,12 +10265,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (LVL.hasUncountableEarlyExit()) {
+ if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "UncountableEarlyExitLoopsUnsupported", ORE, L);
+ "early exit is not enabled",
+ "UncountableEarlyExitLoopsDisabled", ORE, L);
return false;
}
@@ -9977,6 +10294,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (UseInterleaved)
IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ if (LVL.hasUncountableEarlyExit()) {
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (IAI.requiresScalarEpilogue() ||
+ any_of(LVL.getCountableExitingBlocks(),
+ [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
+ reportVectorizationFailure("Auto-vectorization of early exit loops "
+ "requiring a scalar epilogue is unsupported",
+ "UncountableEarlyExitUnsupported", ORE, L);
+ return false;
+ }
+ }
+
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
ScalarEpilogueLowering SEL =
@@ -10243,11 +10572,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
+ BestEpiPlan);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks,
*BestMainPlan);
-
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
@@ -10256,84 +10587,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks, BestEpiPlan);
-
- VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
- VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
- Header->setName("vec.epilog.vector.body");
-
- // Re-use the trip count and steps expanded for the main loop, as
- // skeleton creation needs it as a value that dominates both the scalar
- // and vector epilogue loops
- // TODO: This is a workaround needed for epilogue vectorization and it
- // should be removed once induction resume value creation is done
- // directly in VPlan.
EpilogILV.setTripCount(MainILV.getTripCount());
- for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
- auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
- if (!ExpandR)
- continue;
- auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
- ExpandedSCEVs.find(ExpandR->getSCEV())->second);
- ExpandR->replaceAllUsesWith(ExpandedVal);
- if (BestEpiPlan.getTripCount() == ExpandR)
- BestEpiPlan.resetTripCount(ExpandedVal);
- ExpandR->eraseFromParent();
- }
-
- // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
- // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
- // before vectorizing the epilogue loop.
- for (VPRecipeBase &R : Header->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
- Value *ResumeV = nullptr;
- // TODO: Move setting of resume values to prepareToExecute.
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
- ->getIncomingValueForBlock(L->getLoopPreheader());
- const RecurrenceDescriptor &RdxDesc =
- ReductionPhi->getRecurrenceDescriptor();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
- // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
- // start value; compare the final value from the main vector loop
- // to the start value.
- IRBuilder<> Builder(
- cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
- ResumeV = Builder.CreateICmpNE(ResumeV,
- RdxDesc.getRecurrenceStartValue());
- }
- } else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
- PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
- if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
- IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
- } else {
- auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
- IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
- }
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
- }
- assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
- cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
- }
-
- assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
- "DT not preserved correctly");
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
@@ -10361,6 +10620,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
checkMixedPrecision(L, ORE);
}
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT not preserved correctly");
+
std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 33657c26356d..f52ddfda5e64 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
unsigned SVNumElements =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
unsigned ShuffleMaskSize = SV->getShuffleMask().size();
+ if (SVNumElements % ShuffleMaskSize != 0)
+ return 0;
unsigned GroupSize = SVNumElements / ShuffleMaskSize;
if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
return 0;
@@ -514,7 +516,7 @@ static bool isCommutative(Instruction *I) {
BO->uses(),
[](const Use &U) {
// Commutative, if icmp eq/ne sub, 0
- ICmpInst::Predicate Pred;
+ CmpPredicate Pred;
if (match(U.getUser(),
m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
(Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
@@ -700,7 +702,8 @@ static SmallBitVector isUndefVector(const Value *V,
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
-isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+ AssumptionCache *AC) {
const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
if (It == VL.end())
return std::nullopt;
@@ -717,14 +720,14 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
- bool HasNonUndefVec = any_of(VL, [](Value *V) {
+ bool HasNonUndefVec = any_of(VL, [&](Value *V) {
auto *EE = dyn_cast<ExtractElementInst>(V);
if (!EE)
return false;
Value *Vec = EE->getVectorOperand();
if (isa<UndefValue>(Vec))
return false;
- return isGuaranteedNotToBePoison(Vec);
+ return isGuaranteedNotToBePoison(Vec, AC);
});
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
@@ -807,14 +810,16 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
namespace {
/// Main data required for vectorization of instructions.
-struct InstructionsState {
- /// The very first instruction in the list with the main opcode.
- Value *OpValue = nullptr;
-
- /// The main/alternate instruction.
+class InstructionsState {
+ /// The main/alternate instruction. MainOp is also VL0.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+public:
+ Instruction *getMainOp() const { return MainOp; }
+
+ Instruction *getAltOp() const { return AltOp; }
+
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
@@ -833,9 +838,9 @@ struct InstructionsState {
}
InstructionsState() = delete;
- InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
- : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
- static InstructionsState invalid() { return {nullptr, nullptr, nullptr}; }
+ InstructionsState(Instruction *MainOp, Instruction *AltOp)
+ : MainOp(MainOp), AltOp(AltOp) {}
+ static InstructionsState invalid() { return {nullptr, nullptr}; }
};
} // end anonymous namespace
@@ -1073,7 +1078,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
}
- return InstructionsState(V, cast<Instruction>(V),
+ return InstructionsState(cast<Instruction>(V),
cast<Instruction>(VL[AltIndex]));
}
@@ -1087,7 +1092,8 @@ static bool allSameType(ArrayRef<Value *> VL) {
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
if (!UserInst)
return false;
unsigned Opcode = UserInst->getOpcode();
@@ -1104,7 +1110,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
return any_of(enumerate(CI->args()), [&](auto &&Arg) {
- return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
+ return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
Arg.value().get() == Scalar;
});
}
@@ -1842,12 +1848,12 @@ public:
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() &&
- (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
+ (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
!S.isAltShuffle()) &&
all_of(Ops, [&S](Value *V) {
return isa<PoisonValue>(V) ||
cast<Instruction>(V)->getNumOperands() ==
- S.MainOp->getNumOperands();
+ S.getMainOp()->getNumOperands();
}))
return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
: LookAheadHeuristics::ScoreSameOpcode;
@@ -2017,6 +2023,9 @@ public:
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
+ /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
+ /// is not IntrinsicInst, ArgSize is User::getNumOperands.
+ unsigned ArgSize = 0;
const TargetLibraryInfo &TLI;
const DataLayout &DL;
@@ -2400,14 +2409,15 @@ public:
}
/// Go through the instructions in VL and append their operands.
- void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
+ // IntrinsicInst::isCommutative returns true if swapping the first "two"
+ // arguments to the intrinsic produces the same result.
constexpr unsigned IntrinsicNumOperands = 2;
- auto *VL0 = cast<Instruction>(*find_if(VL, IsaPred<Instruction>));
- unsigned NumOperands = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands
- : VL0->getNumOperands();
+ unsigned NumOperands = VL0->getNumOperands();
+ ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2440,7 +2450,7 @@ public:
}
/// \returns the number of operands.
- unsigned getNumOperands() const { return OpsVec.size(); }
+ unsigned getNumOperands() const { return ArgSize; }
/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -2460,6 +2470,8 @@ public:
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ assert(Op == getValue(OpIdx, Lane) &&
+ "Op is expected to be getValue(OpIdx, Lane).");
// Small number of loads - try load matching.
if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
return false;
@@ -2517,6 +2529,8 @@ public:
/// Checks if there is at least single compatible operand in lanes other
/// than \p Lane, compatible with the operand \p Op.
bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
+ assert(Op == getValue(OpIdx, Lane) &&
+ "Op is expected to be getValue(OpIdx, Lane).");
bool OpAPO = getData(OpIdx, Lane).APO;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
@@ -2537,13 +2551,11 @@ public:
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
- VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
+ VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
: TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
- L(R.LI->getLoopFor(
- (cast<Instruction>(*find_if(RootVL, IsaPred<Instruction>))
- ->getParent()))) {
+ L(R.LI->getLoopFor((VL0->getParent()))) {
// Append all the operands of RootVL.
- appendOperandsOfVL(RootVL);
+ appendOperandsOfVL(RootVL, VL0);
}
/// \Returns a value vector with the operands across all lanes for the
@@ -2617,7 +2629,8 @@ public:
ArrayRef<OperandData> Op0 = OpsVec.front();
for (const OperandData &Data : Op0)
UniqueValues.insert(Data.V);
- for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
+ for (ArrayRef<OperandData> Op :
+ ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
if (any_of(Op, [&UniqueValues](const OperandData &Data) {
return !UniqueValues.contains(Data.V);
}))
@@ -2920,13 +2933,11 @@ private:
/// truncation. We collect the entries that will be demoted in ToDemote.
/// \param E Node for analysis
/// \param ToDemote indices of the nodes to be demoted.
- bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
- unsigned &BitWidth,
- SmallVectorImpl<unsigned> &ToDemote,
- DenseSet<const TreeEntry *> &Visited,
- unsigned &MaxDepthLevel,
- bool &IsProfitableToDemote,
- bool IsTruncRoot) const;
+ bool collectValuesToDemote(
+ const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+ SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
+ const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote, bool IsTruncRoot) const;
/// Check if the operands on the edges \p Edges of the \p UserTE allows
/// reordering (i.e. the operands can be reordered because they have only one
@@ -3138,13 +3149,6 @@ private:
SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
8> &GatheredLoads);
- /// Reorder commutative or alt operands to get better probability of
- /// generating vectorized code.
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const BoUpSLP &R);
-
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
@@ -3307,7 +3311,7 @@ private:
/// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
/// other nodes as a series of insertvector instructions.
- SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
+ SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
@@ -3339,27 +3343,13 @@ private:
copy(OpVL, Operands[OpIdx].begin());
}
- /// Set the operands of this bundle in their original order.
- void setOperandsInOrder() {
- assert(Operands.empty() && "Already initialized?");
- auto *I0 = cast<Instruction>(*find_if(Scalars, IsaPred<Instruction>));
- Operands.resize(I0->getNumOperands());
- unsigned NumLanes = Scalars.size();
- for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
- OpIdx != NumOperands; ++OpIdx) {
- Operands[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- if (isa<PoisonValue>(Scalars[Lane])) {
- Operands[OpIdx][Lane] =
- PoisonValue::get(I0->getOperand(OpIdx)->getType());
- continue;
- }
- auto *I = cast<Instruction>(Scalars[Lane]);
- assert(I->getNumOperands() == NumOperands &&
- "Expected same number of operands");
- Operands[OpIdx][Lane] = I->getOperand(OpIdx);
- }
- }
+ /// Set this bundle's operand from Scalars.
+ void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
+ VLOperands Ops(Scalars, MainOp, R);
+ if (RequireReorder)
+ Ops.reorder();
+ for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
+ setOperand(I, Ops.getVL(I));
}
/// Reorders operands of the node to the given mask \p Mask.
@@ -3410,8 +3400,8 @@ private:
}
void setOperations(const InstructionsState &S) {
- MainOp = S.MainOp;
- AltOp = S.AltOp;
+ MainOp = S.getMainOp();
+ AltOp = S.getAltOp();
}
Instruction *getMainOp() const {
@@ -3555,6 +3545,13 @@ private:
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
+ if (!CombinedEntriesWithIndices.empty()) {
+ dbgs() << "Combined entries: ";
+ interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
+ dbgs() << "Entry index " << P.first << " with offset " << P.second;
+ });
+ dbgs() << "\n";
+ }
}
#endif
};
@@ -3649,8 +3646,8 @@ private:
}
// Update the scheduler bundle to point to this TreeEntry.
ScheduleData *BundleMember = *Bundle;
- assert((BundleMember || isa<PHINode>(S.MainOp) ||
- isVectorLikeInstWithConstOps(S.MainOp) ||
+ assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
doesNotNeedToSchedule(VL)) &&
"Bundle and VL out of sync");
if (BundleMember) {
@@ -3717,9 +3714,11 @@ private:
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState getScalarsVectorizationState(
- InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState
+ getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps);
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -4790,8 +4789,10 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
+ assert(!Order.empty() &&
+ "Order is empty. Please check it before using isReverseOrder.");
unsigned Sz = Order.size();
- return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+ return all_of(enumerate(Order), [&](const auto &Pair) {
return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
});
}
@@ -5642,8 +5643,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
auto PHICompare = [&](unsigned I1, unsigned I2) {
Value *V1 = TE.Scalars[I1];
Value *V2 = TE.Scalars[I2];
- if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
- isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
+ if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
+ return false;
+ if (isa<PoisonValue>(V1))
+ return true;
+ if (isa<PoisonValue>(V2))
return false;
if (V1->getNumUses() < V2->getNumUses())
return true;
@@ -6511,7 +6515,7 @@ void BoUpSLP::buildExternalUses(
// be used.
if (UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
- Scalar, getRootEntryInstruction(*UseEntry), TLI)) {
+ Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!UseEntry->isGather() && "Bad state");
@@ -6935,8 +6939,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// 2. All users are deleted.
// 3. The load broadcasts are not allowed or the load is not
// broadcasted.
- if (std::distance(LI->user_begin(), LI->user_end()) !=
- LI->getNumUses())
+ if (static_cast<unsigned int>(std::distance(
+ LI->user_begin(), LI->user_end())) != LI->getNumUses())
return false;
if (!IsLegalBroadcastLoad)
continue;
@@ -7426,17 +7430,17 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
unsigned Opcode1 = S.getAltOpcode();
SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
// If this pattern is supported by the target then consider it profitable.
- if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
+ if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
Opcode0, Opcode1, OpcodeMask))
return true;
SmallVector<ValueList> Operands;
- for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
+ for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
Operands.emplace_back();
// Prepare the operand vector.
for (Value *V : VL) {
if (isa<PoisonValue>(V)) {
Operands.back().push_back(
- PoisonValue::get(S.MainOp->getOperand(I)->getType()));
+ PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
continue;
}
Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
@@ -7486,7 +7490,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
++ExtraShuffleInsts;
}
}
- const Loop *L = LI->getLoopFor(S.MainOp->getParent());
+ const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
// Vectorize node, if:
// 1. at least single operand is constant or splat.
// 2. Operands have many loop invariants (the instructions are not loop
@@ -7496,7 +7500,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
[&](ArrayRef<Value *> Op) {
if (allConstant(Op) ||
(!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
- getSameOpcode(Op, *TLI).MainOp))
+ getSameOpcode(Op, *TLI).getMainOp()))
return false;
DenseMap<Value *, unsigned> Uniques;
for (Value *V : Op) {
@@ -7528,19 +7532,21 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
// vector operands is number of vector instructions + number of vector
// instructions for operands (buildvectors). Number of buildvector
// instructions is just number_of_operands * number_of_scalars.
- (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
+ (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
(UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
- NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
+ NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
- InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
- assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+ const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps) {
+ assert(S.getMainOp() &&
+ "Expected instructions with same/alternate opcodes only.");
unsigned ShuffleOrOp =
S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
- auto *VL0 = cast<Instruction>(S.OpValue);
+ Instruction *VL0 = S.getMainOp();
switch (ShuffleOrOp) {
case Instruction::PHI: {
// Too many operands - gather, most probably won't be vectorized.
@@ -7712,7 +7718,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::Or:
case Instruction::Xor:
case Instruction::Freeze:
- if (S.MainOp->getType()->isFloatingPointTy() &&
+ if (S.getMainOp()->getType()->isFloatingPointTy() &&
TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && I->isBinaryOp() && !I->isFast();
@@ -7809,7 +7815,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::NeedToGather;
}
case Instruction::Call: {
- if (S.MainOp->getType()->isFloatingPointTy() &&
+ if (S.getMainOp()->getType()->isFloatingPointTy() &&
TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && !I->isFast();
@@ -7834,7 +7840,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
unsigned NumArgs = CI->arg_size();
SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned J = 0; J != NumArgs; ++J)
- if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
ScalarArgs[J] = CI->getArgOperand(J);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
@@ -7850,7 +7856,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned J = 0; J != NumArgs; ++J) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
Value *A1J = CI2->getArgOperand(J);
if (ScalarArgs[J] != A1J) {
LLVM_DEBUG(dbgs()
@@ -8035,7 +8041,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) || !isConstant(V);
}))) {
if (DoNotFail && UniquePositions.size() > 1 &&
- NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+ NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
// Find the number of elements, which forms full vectors.
unsigned PWSz = getFullVectorNumberOfElements(
@@ -8065,8 +8071,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
// place to insert a shuffle if we need to, so just avoid that issue.
- if (S.MainOp &&
- isa<CatchSwitchInst>(S.MainOp->getParent()->getTerminator())) {
+ if (S.getMainOp() &&
+ isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return;
@@ -8074,10 +8080,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check if this is a duplicate of another entry.
if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
+ << ".\n");
if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
- auto It = MultiNodeScalars.find(S.OpValue);
+ auto It = MultiNodeScalars.find(S.getMainOp());
if (It != MultiNodeScalars.end()) {
auto *TEIt = find_if(It->getSecond(),
[&](TreeEntry *ME) { return ME->isSame(VL); });
@@ -8090,7 +8097,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
if (!E) {
- if (!doesNotNeedToBeScheduled(S.OpValue)) {
+ if (!doesNotNeedToBeScheduled(S.getMainOp())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8098,8 +8105,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
SmallPtrSet<const TreeEntry *, 4> Nodes;
- Nodes.insert(getTreeEntry(S.OpValue));
- for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
+ Nodes.insert(getTreeEntry(S.getMainOp()));
+ for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
Nodes.insert(E);
SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
if (any_of(Nodes, [&](const TreeEntry *E) {
@@ -8122,7 +8129,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// used to properly draw the graph rather than for the actual
// vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
<< ".\n");
return;
}
@@ -8133,13 +8140,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// a load), in which case peek through to include it in the tree, without
// ballooning over-budget.
if (Depth >= RecursionMaxDepth &&
- !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
- VL.size() >= 4 &&
- (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
+ !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
+ (match(S.getMainOp(), m_Load(m_Value())) ||
+ all_of(VL, [&S](const Value *I) {
return match(I,
m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
cast<Instruction>(I)->getOpcode() ==
- cast<Instruction>(S.MainOp)->getOpcode();
+ S.getMainOp()->getOpcode();
})))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
if (TryToFindDuplicates(S))
@@ -8151,7 +8158,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't handle scalable vectors
if (S.getOpcode() == Instruction::ExtractElement &&
isa<ScalableVectorType>(
- cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
+ cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8188,7 +8195,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
}));
}
- bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+ bool IsCommutative =
+ isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
if ((IsCommutative &&
std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
(!IsCommutative &&
@@ -8198,20 +8206,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
auto *I1 = cast<Instruction>(VL.front());
auto *I2 = cast<Instruction>(VL.back());
- for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
if (static_cast<unsigned>(count_if(
Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
- })) >= S.MainOp->getNumOperands() / 2)
+ })) >= S.getMainOp()->getNumOperands() / 2)
return false;
- if (S.MainOp->getNumOperands() > 2)
+ if (S.getMainOp()->getNumOperands() > 2)
return true;
if (IsCommutative) {
// Check permuted operands.
Candidates.clear();
- for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand((Op + 1) % E));
if (any_of(
@@ -8246,7 +8254,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
(isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
- S.OpValue) &&
+ S.getMainOp()) &&
!all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
@@ -8313,10 +8321,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
- auto *VL0 = cast<Instruction>(S.OpValue);
+ Instruction *VL0 = S.getMainOp();
BB = VL0->getParent();
- if (S.MainOp &&
+ if (S.getMainOp() &&
(BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
!DT->isReachableFromEntry(BB))) {
// Don't go into unreachable blocks. They may contain instructions with
@@ -8394,7 +8402,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
+ TE->dump());
// Keeps the reordered operands to avoid code duplication.
PHIHandler Handler(*DT, PH, VL);
@@ -8423,13 +8432,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
+ "(ExtractValueInst/ExtractElementInst).\n";
+ TE->dump());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
+ TE->setOperand(*this);
return;
}
case Instruction::InsertElement: {
@@ -8457,9 +8467,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CurrentOrder.clear();
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
{}, CurrentOrder);
- LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
+ TE->setOperand(*this);
buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
return;
}
@@ -8477,30 +8488,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
if (CurrentOrder.empty())
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
+ TE->dump());
else
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs()
+ << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices);
- TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
+ TE->dump());
break;
case TreeEntry::CombinedVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
+ TE->setOperand(*this);
+ if (State == TreeEntry::ScatterVectorize)
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
return;
}
case Instruction::ZExt:
@@ -8536,10 +8553,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
if (ShuffleOrOp == Instruction::Trunc) {
ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8563,15 +8581,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
+ TE->dump());
ValueList Left, Right;
+ VLOperands Ops(VL, VL0, *this);
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == CmpInst::getSwappedPredicate(P0) &&
"Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -8630,29 +8652,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Freeze: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
-
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry "
+ "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
+ TE->dump());
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
case Instruction::GetElementPtr: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
+ TE->dump());
SmallVector<ValueList, 2> Operands(2);
// Prepare the operand vector for pointer operands.
for (Value *V : VL) {
@@ -8710,12 +8724,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
if (Consecutive)
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
+ TE->dump());
else
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
+ TE->dump());
+ TE->setOperand(*this);
+ buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
return;
}
case Instruction::Call: {
@@ -8726,93 +8743,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isCommutative(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- SmallVector<ValueList> Operands;
- for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
- Operands.emplace_back();
- if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
- continue;
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.back().push_back(CI2->getArgOperand(I));
- }
- TE->setOperand(I, Operands.back());
- }
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
- if (Operands[I - 2].empty())
- continue;
- buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
- }
- return;
- }
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
+ TE->dump());
+ TE->setOperand(*this, isCommutative(VL0));
+ for (unsigned I : seq<unsigned>(CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
// vectorize it.
- if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
continue;
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(I));
- }
- buildTree_rec(Operands, Depth + 1, {TE, I});
+ buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
}
return;
}
case Instruction::ShuffleVector: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ if (S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
+ TE->dump());
+ } else {
+ assert(SLPReVec && "Only supported by REVEC.");
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
+ TE->dump());
+ }
// Reorder operands if reordering would enable vectorization.
auto *CI = dyn_cast<CmpInst>(VL0);
- if (isa<BinaryOperator>(VL0) || CI) {
+ if (CI && any_of(VL, [](Value *V) {
+ return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
+ })) {
+ auto *MainCI = cast<CmpInst>(S.getMainOp());
+ auto *AltCI = cast<CmpInst>(S.getAltOp());
+ CmpInst::Predicate MainP = MainCI->getPredicate();
+ CmpInst::Predicate AltP = AltCI->getPredicate();
+ assert(MainP != AltP &&
+ "Expected different main/alternate predicates.");
ValueList Left, Right;
- if (!CI || all_of(VL, [](Value *V) {
- return isa<PoisonValue>(V) || cast<CmpInst>(V)->isCommutative();
- })) {
- reorderInputsAccordingToOpcode(VL, Left, Right, *this);
- } else {
- auto *MainCI = cast<CmpInst>(S.MainOp);
- auto *AltCI = cast<CmpInst>(S.AltOp);
- CmpInst::Predicate MainP = MainCI->getPredicate();
- CmpInst::Predicate AltP = AltCI->getPredicate();
- assert(MainP != AltP &&
- "Expected different main/alternate predicates.");
- // Collect operands - commute if it uses the swapped predicate or
- // alternate operation.
- for (Value *V : VL) {
- if (isa<PoisonValue>(V)) {
- Left.push_back(
- PoisonValue::get(MainCI->getOperand(0)->getType()));
- Right.push_back(
- PoisonValue::get(MainCI->getOperand(1)->getType()));
- continue;
- }
- auto *Cmp = cast<CmpInst>(V);
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V)) {
+ Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
+ Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
+ continue;
+ }
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
- if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
- if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(LHS, RHS);
- } else {
- if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(LHS, RHS);
- }
- Left.push_back(LHS);
- Right.push_back(RHS);
+ if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
+ if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(LHS, RHS);
+ } else {
+ if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(LHS, RHS);
}
+ Left.push_back(LHS);
+ Right.push_back(RHS);
}
TE->setOperand(0, Left);
TE->setOperand(1, Right);
@@ -8821,8 +8809,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
- TE->setOperandsInOrder();
- for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+ TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
@@ -9707,7 +9695,7 @@ void BoUpSLP::transformNodes() {
auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
const InstructionsState &S) {
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
- for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
+ for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
return all_of(
@@ -9778,7 +9766,8 @@ void BoUpSLP::transformNodes() {
Slice.front()->getType(), 2 * VF)),
1U, 2 * VF)) ||
count(Slice, Slice.front()) ==
- (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
+ static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
+ : 1)) {
if (IsSplat)
continue;
InstructionsState S = getSameOpcode(Slice, *TLI);
@@ -9791,7 +9780,7 @@ void BoUpSLP::transformNodes() {
// Try to vectorize reduced values or if all users are vectorized.
// For expensive instructions extra extracts might be profitable.
if ((!UserIgnoreList || E.Idx != 0) &&
- TTI->getInstructionCost(S.MainOp, CostKind) <
+ TTI->getInstructionCost(S.getMainOp(), CostKind) <
TTI::TCC_Expensive &&
!all_of(Slice, [&](Value *V) {
if (isa<PoisonValue>(V))
@@ -9818,10 +9807,10 @@ void BoUpSLP::transformNodes() {
continue;
}
} else if (S.getOpcode() == Instruction::ExtractElement ||
- (TTI->getInstructionCost(S.MainOp, CostKind) <
+ (TTI->getInstructionCost(S.getMainOp(), CostKind) <
TTI::TCC_Expensive &&
!CheckOperandsProfitability(
- S.MainOp,
+ S.getMainOp(),
cast<Instruction>(*find_if(reverse(Slice),
IsaPred<Instruction>)),
S))) {
@@ -9891,7 +9880,7 @@ void BoUpSLP::transformNodes() {
Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
- if (isReverseOrder(E.ReorderIndices) &&
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
SmallVector<int> Mask;
inversePermutation(E.ReorderIndices, Mask);
@@ -9918,7 +9907,7 @@ void BoUpSLP::transformNodes() {
Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
- if (isReverseOrder(E.ReorderIndices) &&
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
SmallVector<int> Mask;
inversePermutation(E.ReorderIndices, Mask);
@@ -10272,9 +10261,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// sub-Mask into the CommonMask to estimate it later and avoid double cost
// estimation.
if ((InVectors.size() == 2 &&
- InVectors.front().get<const TreeEntry *>() == &E1 &&
- InVectors.back().get<const TreeEntry *>() == E2) ||
- (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
+ cast<const TreeEntry *>(InVectors.front()) == &E1 &&
+ cast<const TreeEntry *>(InVectors.back()) == E2) ||
+ (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
@@ -10300,7 +10289,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VF = std::max(VF,
cast<FixedVectorType>(V1->getType())->getNumElements());
} else {
- const auto *E = InVectors.front().get<const TreeEntry *>();
+ const auto *E = cast<const TreeEntry *>(InVectors.front());
VF = std::max(VF, E->getVectorFactor());
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
@@ -10316,7 +10305,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VF = std::max(VF,
getNumElements(V1->getType()));
} else {
- const auto *E = P.get<const TreeEntry *>();
+ const auto *E = cast<const TreeEntry *>(P);
VF = std::max(VF, E->getVectorFactor());
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
@@ -10422,9 +10411,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
};
if (!V1 && !V2 && !P2.isNull()) {
// Shuffle 2 entry nodes.
- const TreeEntry *E = P1.get<const TreeEntry *>();
+ const TreeEntry *E = cast<const TreeEntry *>(P1);
unsigned VF = E->getVectorFactor();
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ const TreeEntry *E2 = cast<const TreeEntry *>(P2);
CommonVF = std::max(VF, E2->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10456,7 +10445,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
} else if (!V1 && P2.isNull()) {
// Shuffle single entry node.
- const TreeEntry *E = P1.get<const TreeEntry *>();
+ const TreeEntry *E = cast<const TreeEntry *>(P1);
unsigned VF = E->getVectorFactor();
CommonVF = VF;
assert(
@@ -10505,7 +10494,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
} else if (V1 && !V2) {
// Shuffle vector and tree node.
unsigned VF = getVF(V1);
- const TreeEntry *E2 = P2.get<const TreeEntry *>();
+ const TreeEntry *E2 = cast<const TreeEntry *>(P2);
CommonVF = std::max(VF, E2->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10531,7 +10520,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
} else if (!V1 && V2) {
// Shuffle vector and tree node.
unsigned VF = getVF(V2);
- const TreeEntry *E1 = P1.get<const TreeEntry *>();
+ const TreeEntry *E1 = cast<const TreeEntry *>(P1);
CommonVF = std::max(VF, E1->getVectorFactor());
assert(all_of(Mask,
[=](int Idx) {
@@ -10769,8 +10758,8 @@ public:
if (P.value() == PoisonMaskElem)
return Mask[P.index()] == PoisonMaskElem;
auto *EI = cast<ExtractElementInst>(
- InVectors.front().get<const TreeEntry *>()->getOrdered(
- P.index()));
+ cast<const TreeEntry *>(InVectors.front())
+ ->getOrdered(P.index()));
return EI->getVectorOperand() == V1 ||
EI->getVectorOperand() == V2;
}) &&
@@ -10787,23 +10776,21 @@ public:
}
if (ForExtracts) {
// No need to add vectors here, already handled them in adjustExtracts.
- assert(
- InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
- !CommonMask.empty() &&
- all_of(enumerate(CommonMask),
- [&](auto P) {
- Value *Scalar =
- InVectors.front().get<const TreeEntry *>()->getOrdered(
- P.index());
- if (P.value() == PoisonMaskElem)
- return P.value() == Mask[P.index()] ||
- isa<UndefValue>(Scalar);
- if (isa<Constant>(V1))
- return true;
- auto *EI = cast<ExtractElementInst>(Scalar);
- return EI->getVectorOperand() == V1;
- }) &&
- "Expected only tree entry for extractelement vectors.");
+ assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
+ !CommonMask.empty() &&
+ all_of(enumerate(CommonMask),
+ [&](auto P) {
+ Value *Scalar = cast<const TreeEntry *>(InVectors[0])
+ ->getOrdered(P.index());
+ if (P.value() == PoisonMaskElem)
+ return P.value() == Mask[P.index()] ||
+ isa<UndefValue>(Scalar);
+ if (isa<Constant>(V1))
+ return true;
+ auto *EI = cast<ExtractElementInst>(Scalar);
+ return EI->getVectorOperand() == V1;
+ }) &&
+ "Expected only tree entry for extractelement vectors.");
return;
}
assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -10818,7 +10805,7 @@ public:
VF = std::max(VF, InTE->getVectorFactor());
} else {
VF = std::max(
- VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
+ VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
->getNumElements());
}
InVectors.push_back(V1);
@@ -10888,7 +10875,7 @@ public:
CommonMask[Idx] = Idx;
assert(VF > 0 &&
"Expected vector length for the final value before action.");
- Value *V = Vec.get<Value *>();
+ Value *V = cast<Value *>(Vec);
Action(V, CommonMask);
InVectors.front() = V;
}
@@ -10998,14 +10985,14 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
-static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
- const Intrinsic::ID ID,
- const unsigned VF,
- unsigned MinBW) {
+static SmallVector<Type *>
+buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
+ const unsigned VF, unsigned MinBW,
+ const TargetTransformInfo *TTI) {
SmallVector<Type *> ArgTys;
for (auto [Idx, Arg] : enumerate(CI->args())) {
if (ID != Intrinsic::not_intrinsic) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
ArgTys.push_back(Arg->getType());
continue;
}
@@ -11044,7 +11031,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
unsigned EntryVF = E->getVectorFactor();
auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->isGather()) {
if (allConstant(VL))
return 0;
@@ -11057,9 +11043,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
- if (!E->ReorderIndices.empty() &&
- (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
+ if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
+ !isReverseOrder(E->ReorderIndices))) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -11070,7 +11055,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
::addMask(Mask, NewMask);
}
- if (NeedToShuffleReuses)
+ if (!E->ReuseShuffleIndices.empty())
::addMask(Mask, E->ReuseShuffleIndices);
if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
CommonCost =
@@ -11458,7 +11443,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
- CmpInst::Predicate VecPred, SwappedVecPred;
+ CmpPredicate VecPred, SwappedVecPred;
auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
match(VL0, MatchCmp))
@@ -11472,13 +11457,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return InstructionCost(TTI::TCC_Free);
auto *VI = cast<Instruction>(UniqueValues[Idx]);
- CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
- ? CmpInst::BAD_FCMP_PREDICATE
- : CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
+ ? CmpInst::BAD_FCMP_PREDICATE
+ : CmpInst::BAD_ICMP_PREDICATE;
auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+ // FIXME: Use CmpPredicate::getMatching here.
if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
!match(VI, MatchCmp)) ||
- (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
+ (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
+ CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
? CmpInst::BAD_FCMP_PREDICATE
: CmpInst::BAD_ICMP_PREDICATE;
@@ -11707,9 +11694,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto GetVectorCost = [=](InstructionCost CommonCost) {
auto *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- SmallVector<Type *> ArgTys =
- buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
- It != MinBWs.end() ? It->second.first : 0);
+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
+ CI, ID, VecTy->getNumElements(),
+ It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
};
@@ -11894,7 +11881,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
TE->Scalars.size() < Limit ||
((TE->getOpcode() == Instruction::ExtractElement ||
all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
- isFixedVectorShuffle(TE->Scalars, Mask)) ||
+ isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
(TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
any_of(TE->Scalars, IsaPred<LoadInst>));
};
@@ -12959,7 +12946,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
std::optional<TTI::ShuffleKind> Res =
- isFixedVectorShuffle(GatheredExtracts, Mask);
+ isFixedVectorShuffle(GatheredExtracts, Mask, AC);
if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
// TODO: try to check other subsets if possible.
// Restore the original VL if attempt was not successful.
@@ -13209,14 +13196,15 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
copy(CommonMask, Mask.begin());
}
// Clear undef scalars.
- for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (isa<PoisonValue>(VL[I]))
- Mask[I] = PoisonMaskElem;
+ Mask[Part * VL.size() + I] = PoisonMaskElem;
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node from the
// tree.
Entries.push_back(FirstEntries.front());
+ VF = FirstEntries.front()->getVectorFactor();
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -13257,6 +13245,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Entries.push_back(SecondEntries.front());
VF = std::max(Entries.front()->getVectorFactor(),
Entries.back()->getVectorFactor());
+ } else {
+ VF = Entries.front()->getVectorFactor();
}
}
@@ -13368,17 +13358,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
}
- switch (Entries.size()) {
- case 1:
- if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
- return TargetTransformInfo::SK_PermuteSingleSrc;
- break;
- case 2:
- if (EntryLanes.size() > 2 || VL.size() <= 2)
- return TargetTransformInfo::SK_PermuteTwoSrc;
- break;
- default:
- break;
+ if (ForOrder || IsIdentity || Entries.empty()) {
+ switch (Entries.size()) {
+ case 1:
+ if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ break;
+ case 2:
+ if (EntryLanes.size() > 2 || VL.size() <= 2)
+ return TargetTransformInfo::SK_PermuteTwoSrc;
+ break;
+ default:
+ break;
+ }
+ } else if (!isa<VectorType>(VL.front()->getType()) &&
+ (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
+ // Do the cost estimation to check if shuffle is more beneficial than buildvector.
+ SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
+ std::next(Mask.begin(), (Part + 1) * VL.size()));
+ int MinElement = SubMask.front(), MaxElement = SubMask.front();
+ for (int Idx : SubMask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
+ MinElement = Idx;
+ if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
+ MaxElement = Idx;
+ }
+ assert(MaxElement >= 0 && MinElement >= 0 &&
+ MaxElement % VF >= MinElement % VF &&
+ "Expected at least single element.");
+ unsigned NewVF = std::max<unsigned>(
+ VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
+ (MaxElement % VF) -
+ (MinElement % VF) + 1));
+ if (NewVF < VF) {
+ for_each(SubMask, [&](int &Idx) {
+ if (Idx == PoisonMaskElem)
+ return;
+ Idx = (Idx % VF) - (MinElement % VF) +
+ (Idx >= static_cast<int>(VF) ? NewVF : 0);
+ });
+ VF = NewVF;
+ }
+
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *VecTy = getWidenedType(VL.front()->getType(), VF);
+ auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
+ auto GetShuffleCost = [&,
+ &TTI = *TTI](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> Entries,
+ VectorType *VecTy) -> InstructionCost {
+ if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+ Mask, Entries.front()->getInterleaveFactor()))
+ return TTI::TCC_Free;
+ return ::getShuffleCost(TTI,
+ Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
+ : TTI::SK_PermuteSingleSrc,
+ VecTy, Mask, CostKind);
+ };
+ InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
+ InstructionCost FirstShuffleCost = 0;
+ SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
+ if (Entries.size() == 1 || !Entries[0]->isGather()) {
+ FirstShuffleCost = ShuffleCost;
+ } else {
+ // Transform mask to include only first entry.
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ bool IsIdentity = true;
+ for (auto [I, Idx] : enumerate(FirstMask)) {
+ if (Idx >= static_cast<int>(VF)) {
+ Idx = PoisonMaskElem;
+ } else {
+ DemandedElts.clearBit(I);
+ if (Idx != PoisonMaskElem)
+ IsIdentity &= static_cast<int>(I) == Idx;
+ }
+ }
+ if (!IsIdentity)
+ FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
+ FirstShuffleCost += TTI->getScalarizationOverhead(
+ MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ }
+ InstructionCost SecondShuffleCost = 0;
+ SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
+ if (Entries.size() == 1 || !Entries[1]->isGather()) {
+ SecondShuffleCost = ShuffleCost;
+ } else {
+ // Transform mask to include only second entry.
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ bool IsIdentity = true;
+ for (auto [I, Idx] : enumerate(SecondMask)) {
+ if (Idx < static_cast<int>(VF) && Idx >= 0) {
+ Idx = PoisonMaskElem;
+ } else {
+ DemandedElts.clearBit(I);
+ if (Idx != PoisonMaskElem) {
+ Idx -= VF;
+ IsIdentity &= static_cast<int>(I) == Idx;
+ }
+ }
+ }
+ if (!IsIdentity)
+ SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
+ SecondShuffleCost += TTI->getScalarizationOverhead(
+ MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ }
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+ for (auto [I, Idx] : enumerate(SubMask))
+ if (Idx == PoisonMaskElem)
+ DemandedElts.clearBit(I);
+ InstructionCost BuildVectorCost =
+ TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
+ const TreeEntry *BestEntry = nullptr;
+ if (FirstShuffleCost < ShuffleCost) {
+ copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
+ BestEntry = Entries.front();
+ ShuffleCost = FirstShuffleCost;
+ }
+ if (SecondShuffleCost < ShuffleCost) {
+ copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
+ BestEntry = Entries[1];
+ ShuffleCost = SecondShuffleCost;
+ }
+ if (BuildVectorCost >= ShuffleCost) {
+ if (BestEntry) {
+ Entries.clear();
+ Entries.push_back(BestEntry);
+ }
+ return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+ }
}
Entries.clear();
// Clear the corresponding mask elements.
@@ -13526,21 +13640,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
return Cost;
}
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const BoUpSLP &R) {
- if (VL.empty())
- return;
- VLOperands Ops(VL, R);
- // Reorder the operands in place.
- Ops.reorder();
- Left = Ops.getVL(0);
- Right = Ops.getVL(1);
-}
-
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
if (Res)
@@ -14481,10 +14580,10 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
VE->isSame(TE->Scalars);
}));
};
- TreeEntry *VE = getTreeEntry(S.OpValue);
+ TreeEntry *VE = getTreeEntry(S.getMainOp());
if (VE && CheckSameVE(VE))
return VE;
- auto It = MultiNodeScalars.find(S.OpValue);
+ auto It = MultiNodeScalars.find(S.getMainOp());
if (It != MultiNodeScalars.end()) {
auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
return TE != VE && CheckSameVE(TE);
@@ -14862,7 +14961,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
// non-poisonous, or by freezing the incoming scalar value first.
auto *It = find_if(Scalars, [this, E](Value *V) {
return !isa<UndefValue>(V) &&
- (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
+ (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
(E->UserTreeIndices.size() == 1 &&
any_of(V->uses(), [E](const Use &U) {
// Check if the value already used in the same operation in
@@ -14934,11 +15033,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
if (Vec2) {
IsUsedInExpr = false;
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
+ isGuaranteedNotToBePoison(Vec2, AC);
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
} else if (Vec1) {
- bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
+ bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
IsUsedInExpr &= FindReusedSplat(
ExtractMask,
cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
@@ -14969,7 +15068,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (TEs.size() == 1) {
bool IsNotPoisonedVec =
TEs.front()->VectorizedValue
- ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
+ ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
: true;
IsUsedInExpr &=
FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
@@ -14981,8 +15080,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
}
}
}
@@ -15133,7 +15232,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return Vec;
}
- bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
+ bool IsReverseOrder =
+ !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
if (E->getOpcode() == Instruction::Store &&
@@ -15316,7 +15416,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
}
if (!IsIdentity || NumElts != NumScalars) {
Value *V2 = nullptr;
- bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
+ bool IsVNonPoisonous =
+ !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
SmallVector<int> InsertMask(Mask);
if (NumElts != NumScalars && Offset == 0) {
// Follow all insert element instructions from the current buildvector
@@ -15519,6 +15620,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
+ ICmp->setSameSign(/*B=*/false);
// Do not cast for cmps.
VecTy = cast<FixedVectorType>(V->getType());
V = FinalShuffle(V, E);
@@ -15881,9 +15984,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- SmallVector<Type *> ArgTys =
- buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
- It != MinBWs.end() ? It->second.first : 0);
+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
+ CI, ID, VecTy->getNumElements(),
+ It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
VecCallCosts.first <= VecCallCosts.second;
@@ -15899,7 +16002,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
ScalarArg = CEI->getArgOperand(I);
// If we decided to reduce the bitwidth of the abs intrinsic, its second
// argument must be set to false (do not return poison if the value is the
// signed min).
@@ -16214,6 +16317,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
}
Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
+ if (auto *VecI = dyn_cast<Instruction>(Vec);
+ VecI && VecI->getParent() == Builder.GetInsertBlock() &&
+ Builder.GetInsertPoint()->comesBefore(VecI))
+ VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
+ Builder.GetInsertPoint());
if (Vec->getType() != PrevVec->getType()) {
assert(Vec->getType()->isIntOrIntVectorTy() &&
PrevVec->getType()->isIntOrIntVectorTy() &&
@@ -16433,7 +16541,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
E->State == TreeEntry::StridedVectorize) &&
doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry),
- TLI);
+ TLI, TTI);
})) &&
"Scalar with nullptr User must be registered in "
"ExternallyUsedValues map or remain as scalar in vectorized "
@@ -16966,13 +17074,13 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
- if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
- doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
return nullptr;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
- LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
ScheduleData *Bundle) {
@@ -17053,7 +17161,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
auto *Bundle = buildBundle(VL);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
- cancelScheduling(VL, S.OpValue);
+ cancelScheduling(VL, S.getMainOp());
return std::nullopt;
}
return Bundle;
@@ -17574,8 +17682,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
bool BoUpSLP::collectValuesToDemote(
const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
- unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
- bool IsTruncRoot) const {
+ const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
+ bool &IsProfitableToDemote, bool IsTruncRoot) const {
// We can always demote constants.
if (all_of(E.Scalars, IsaPred<Constant>))
return true;
@@ -17587,6 +17695,10 @@ bool BoUpSLP::collectValuesToDemote(
return true;
}
+ // Check if the node was analyzed already and must keep its original bitwidth.
+ if (NodesToKeepBWs.contains(E.Idx))
+ return false;
+
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
@@ -17682,8 +17794,8 @@ bool BoUpSLP::collectValuesToDemote(
for (const TreeEntry *Op : Operands) {
unsigned Level = InitLevel;
if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
- ToDemote, Visited, Level, IsProfitableToDemote,
- IsTruncRoot)) {
+ ToDemote, Visited, NodesToKeepBWs, Level,
+ IsProfitableToDemote, IsTruncRoot)) {
if (!IsProfitableToDemote)
return false;
NeedToExit = true;
@@ -17929,7 +18041,8 @@ bool BoUpSLP::collectValuesToDemote(
// Choose the best bitwidth based on cost estimations.
auto Checker = [&](unsigned BitWidth, unsigned) {
unsigned MinBW = PowerOf2Ceil(BitWidth);
- SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
+ SmallVector<Type *> ArgTys =
+ buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
auto VecCallCosts = getVectorCallCosts(
IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
TTI, TLI, ArgTys);
@@ -17985,6 +18098,7 @@ void BoUpSLP::computeMinimumValueSizes() {
bool IsTruncRoot = false;
bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
SmallVector<unsigned> RootDemotes;
+ SmallDenseSet<unsigned, 8> NodesToKeepBWs;
if (NodeIdx != 0 &&
VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
@@ -18008,6 +18122,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Check if the root is trunc and the next node is gather/buildvector, then
// keep trunc in scalars, which is free in most cases.
if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
+ !NodesToKeepBWs.contains(E.Idx) &&
E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
all_of(E.Scalars, [&](Value *V) {
return V->hasOneUse() || isa<Constant>(V) ||
@@ -18130,8 +18245,8 @@ void BoUpSLP::computeMinimumValueSizes() {
bool NeedToDemote = IsProfitableToDemote;
if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
- ToDemote, Visited, MaxDepthLevel, NeedToDemote,
- IsTruncRoot) ||
+ ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
+ NeedToDemote, IsTruncRoot) ||
(MaxDepthLevel <= Limit &&
!(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
(!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
@@ -18265,7 +18380,7 @@ void BoUpSLP::computeMinimumValueSizes() {
});
}
- // If the maximum bit width we compute is less than the with of the roots'
+ // If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth == 0 ||
MaxBitWidth >=
@@ -18273,6 +18388,7 @@ void BoUpSLP::computeMinimumValueSizes() {
->getBitWidth()) {
if (UserIgnoreList)
AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
+ NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
continue;
}
@@ -18432,7 +18548,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
(VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
if ((!IsAllowedSize && S.getOpcode() &&
S.getOpcode() != Instruction::Load &&
- (!S.MainOp->isSafeToRemove() ||
+ (!S.getMainOp()->isSafeToRemove() ||
any_of(ValOps.getArrayRef(),
[&](Value *V) {
return !isa<ExtractElementInst>(V) &&
@@ -18969,7 +19085,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!S.getOpcode())
return false;
- Instruction *I0 = cast<Instruction>(S.OpValue);
+ Instruction *I0 = S.getMainOp();
// Make sure invalid types (including vector type) are rejected before
// determining vectorization factor for scalar instructions.
for (Value *V : VL) {
@@ -19381,7 +19497,7 @@ public:
// %3 = extractelement <2 x i32> %a, i32 0
// %4 = extractelement <2 x i32> %a, i32 1
// %select = select i1 %cond, i32 %3, i32 %4
- CmpInst::Predicate Pred;
+ CmpPredicate Pred;
Instruction *L1;
Instruction *L2;
@@ -19656,7 +19772,7 @@ public:
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC) {
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
@@ -19700,20 +19816,35 @@ public:
return cast<Instruction>(ScalarCond);
};
+ bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
+ return isBoolLogicOp(cast<Instruction>(V));
+ });
// Return new VectorizedTree, based on previous value.
auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
if (VectorizedTree) {
// Update the final value in the reduction.
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
- (isGuaranteedNotToBePoison(Res) &&
- !isGuaranteedNotToBePoison(VectorizedTree))) {
- auto It = ReducedValsToOps.find(Res);
- if (It != ReducedValsToOps.end() &&
- any_of(It->getSecond(),
- [](Instruction *I) { return isBoolLogicOp(I); }))
+ if (AnyBoolLogicOp) {
+ auto It = ReducedValsToOps.find(VectorizedTree);
+ auto It1 = ReducedValsToOps.find(Res);
+ if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
+ isGuaranteedNotToBePoison(VectorizedTree, AC) ||
+ (It != ReducedValsToOps.end() &&
+ any_of(It->getSecond(), [&](Instruction *I) {
+ return isBoolLogicOp(I) &&
+ getRdxOperand(I, 0) == VectorizedTree;
+ }))) {
+ ;
+ } else if (isGuaranteedNotToBePoison(Res, AC) ||
+ (It1 != ReducedValsToOps.end() &&
+ any_of(It1->getSecond(), [&](Instruction *I) {
+ return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
+ }))) {
std::swap(VectorizedTree, Res);
+ } else {
+ VectorizedTree = Builder.CreateFreeze(VectorizedTree);
+ }
}
return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
@@ -19722,9 +19853,6 @@ public:
// Initialize the final value in the reduction.
return Res;
};
- bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
- return isBoolLogicOp(cast<Instruction>(V));
- });
SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
ReductionOps.front().size());
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -19801,7 +19929,7 @@ public:
TrackedToOrig.try_emplace(RdxVal, RV);
}
SmallVector<int> Mask;
- if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+ if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
++I;
Candidates.swap(CommonCandidates);
ShuffledExtracts = true;
@@ -20116,7 +20244,7 @@ public:
// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be
// frozen.
- if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot))
+ if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
// Emit code to correctly handle reused reduced values, if required.
@@ -20223,13 +20351,13 @@ public:
bool InitStep) {
if (!AnyBoolLogicOp)
return;
- if (isBoolLogicOp(RedOp1) &&
- ((!InitStep && LHS == VectorizedTree) ||
- getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
+ if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
+ getRdxOperand(RedOp1, 0) == LHS ||
+ isGuaranteedNotToBePoison(LHS, AC)))
return;
if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
getRdxOperand(RedOp2, 0) == RHS ||
- isGuaranteedNotToBePoison(RHS))) {
+ isGuaranteedNotToBePoison(RHS, AC))) {
std::swap(LHS, RHS);
return;
}
@@ -20515,6 +20643,8 @@ private:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
+ case RecurKind::IFindLastIV:
+ case RecurKind::FFindLastIV:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -20612,6 +20742,8 @@ private:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
+ case RecurKind::IFindLastIV:
+ case RecurKind::FFindLastIV:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for reused scalars.");
}
@@ -20873,7 +21005,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -20979,8 +21111,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<int> Mask;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
- (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
- isFixedVectorShuffle(BuildVectorOpds, Mask)))
+ (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
+ isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
return false;
if (MaxVFOnly && BuildVectorInsts.size() == 2) {
@@ -21198,8 +21330,11 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
if (R.isDeleted(I))
continue;
for (Value *Op : I->operands())
- if (auto *RootOp = dyn_cast<Instruction>(Op))
+ if (auto *RootOp = dyn_cast<Instruction>(Op)) {
Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
+ if (R.isDeleted(I))
+ break;
+ }
}
// Try to vectorize operands as vector bundles.
for (CmpInst *I : CmpInsts) {
@@ -21735,9 +21870,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getType()->getScalarSizeInBits())
return false;
// UndefValues are compatible with all other values.
- if (isa<UndefValue>(V->getValueOperand()) ||
- isa<UndefValue>(V2->getValueOperand()))
- return false;
if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
@@ -21751,14 +21883,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S.getOpcode())
- return false;
return I1->getOpcode() < I2->getOpcode();
}
- if (isa<Constant>(V->getValueOperand()) &&
- isa<Constant>(V2->getValueOperand()))
- return false;
return V->getValueOperand()->getValueID() <
V2->getValueOperand()->getValueID();
};
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index 4b0e12c28f07..ba62c45a4e70 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -325,6 +325,113 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) {
setDefUseUnscheduledSuccs(NewInterval);
}
+/// Scans the instruction list backwards, starting at the instruction of \p N
+/// (or at its predecessor when \p IncludingN is false), and returns the first
+/// MemDGNode encountered. Returns nullptr if the scan runs off the front of
+/// the instruction list or reaches an instruction that has no node in the
+/// graph.
+MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N,
+                                               bool IncludingN) const {
+  auto *Cursor = N->getInstruction();
+  if (!IncludingN)
+    Cursor = Cursor->getPrevNode();
+  while (Cursor != nullptr) {
+    auto *CursorN = getNodeOrNull(Cursor);
+    // An instruction without a node ends the search.
+    if (CursorN == nullptr)
+      return nullptr;
+    if (auto *FoundMemN = dyn_cast<MemDGNode>(CursorN))
+      return FoundMemN;
+    Cursor = Cursor->getPrevNode();
+  }
+  return nullptr;
+}
+
+/// Scans the instruction list forwards, starting at the instruction of \p N
+/// (or at its successor when \p IncludingN is false), and returns the first
+/// MemDGNode encountered. Returns nullptr if the scan runs off the end of the
+/// instruction list or reaches an instruction that has no node in the graph.
+MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N,
+                                              bool IncludingN) const {
+  auto *Cursor = N->getInstruction();
+  if (!IncludingN)
+    Cursor = Cursor->getNextNode();
+  while (Cursor != nullptr) {
+    auto *CursorN = getNodeOrNull(Cursor);
+    // An instruction without a node ends the search.
+    if (CursorN == nullptr)
+      return nullptr;
+    if (auto *FoundMemN = dyn_cast<MemDGNode>(CursorN))
+      return FoundMemN;
+    Cursor = Cursor->getNextNode();
+  }
+  return nullptr;
+}
+
+/// Registers a node for the newly created instruction \p I and, if that node
+/// is a MemDGNode, links it to the nearest memory nodes before and after it.
+void DependencyGraph::notifyCreateInstr(Instruction *I) {
+  auto *NewMemN = dyn_cast<MemDGNode>(getOrCreateNode(I));
+  // TODO: Update the dependencies for the new node.
+
+  // Only memory nodes take part in the MemDGNode chain.
+  if (NewMemN == nullptr)
+    return;
+
+  // Hook up the closest memory node above the new one, if there is any.
+  MemDGNode *Above = getMemDGNodeBefore(NewMemN, /*IncludingN=*/false);
+  if (Above != nullptr) {
+    Above->NextMemN = NewMemN;
+    NewMemN->PrevMemN = Above;
+  }
+  // Hook up the closest memory node below the new one, if there is any.
+  MemDGNode *Below = getMemDGNodeAfter(NewMemN, /*IncludingN=*/false);
+  if (Below != nullptr) {
+    Below->PrevMemN = NewMemN;
+    NewMemN->NextMemN = Below;
+  }
+}
+
+/// Updates the graph for a move of \p I to the position in front of \p To:
+/// forwards the move to DAGInterval and, if \p I has a MemDGNode, re-links
+/// that node in the MemDGNode chain at its new position.
+///
+/// NOTE(review): the early-return check reads I->getNextNode(), so this
+/// appears to run while `I` is still at its old position - confirm against
+/// the caller.
+void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) {
+  // Early return if `I` doesn't actually move.
+  BasicBlock *BB = To.getNodeParent();
+  if (To != BB->end() && &*To == I->getNextNode())
+    return;
+
+  // Maintain the DAGInterval.
+  DAGInterval.notifyMoveInstr(I, To);
+
+  // TODO: Perhaps check if this is legal by checking the dependencies?
+
+  // Update the MemDGNode chain to reflect the instr movement if necessary.
+  auto *MemN = dyn_cast_or_null<MemDGNode>(getNodeOrNull(I));
+  if (MemN == nullptr)
+    return;
+  // First detach it from the existing chain.
+  MemN->detachFromChain();
+
+  // The lookups below walk the instruction list, where `I` can still be found
+  // at its old position, so they may return MemN itself. Step over it, so
+  // that the node never gets linked to itself.
+  auto PrevMemSkippingSelf = [this, MemN](DGNode *From) -> MemDGNode * {
+    MemDGNode *Found = getMemDGNodeBefore(From, /*IncludingN=*/false);
+    if (Found == MemN)
+      Found = getMemDGNodeBefore(MemN, /*IncludingN=*/false);
+    return Found;
+  };
+  auto NextMemSkippingSelf = [this, MemN](DGNode *From) -> MemDGNode * {
+    MemDGNode *Found = getMemDGNodeAfter(From, /*IncludingN=*/true);
+    if (Found == MemN)
+      Found = getMemDGNodeAfter(MemN, /*IncludingN=*/false);
+    return Found;
+  };
+
+  // Now insert it back into the chain at the new location.
+  if (To != BB->end()) {
+    DGNode *ToN = getNodeOrNull(&*To);
+    if (ToN != nullptr) {
+      MemDGNode *PrevMemN = PrevMemSkippingSelf(ToN);
+      MemDGNode *NextMemN = NextMemSkippingSelf(ToN);
+      MemN->PrevMemN = PrevMemN;
+      if (PrevMemN != nullptr)
+        PrevMemN->NextMemN = MemN;
+      MemN->NextMemN = NextMemN;
+      if (NextMemN != nullptr)
+        NextMemN->PrevMemN = MemN;
+    }
+  } else {
+    // MemN becomes the last instruction in the BB.
+    auto *TermN = getNodeOrNull(BB->getTerminator());
+    if (TermN != nullptr) {
+      MemDGNode *PrevMemN = PrevMemSkippingSelf(TermN);
+      // There may be no memory node in front of the terminator, in which case
+      // there is nothing to link MemN to.
+      if (PrevMemN != nullptr)
+        PrevMemN->NextMemN = MemN;
+      MemN->PrevMemN = PrevMemN;
+    } else {
+      // The terminator is outside the DAG interval so do nothing.
+    }
+  }
+}
+
+/// Removes \p I from the graph: if it has a MemDGNode, the nearest memory
+/// nodes before and after it are linked to each other, and then the entry of
+/// \p I is erased from InstrToNodeMap.
+void DependencyGraph::notifyEraseInstr(Instruction *I) {
+  // Update the MemDGNode chain if this is a memory node.
+  DGNode *ErasedN = getNodeOrNull(I);
+  if (auto *ErasedMemN = dyn_cast_or_null<MemDGNode>(ErasedN)) {
+    // Look up both neighbours first, then bridge them over the erased node.
+    MemDGNode *Above = getMemDGNodeBefore(ErasedMemN, /*IncludingN=*/false);
+    MemDGNode *Below = getMemDGNodeAfter(ErasedMemN, /*IncludingN=*/false);
+    if (Above != nullptr)
+      Above->NextMemN = Below;
+    if (Below != nullptr)
+      Below->PrevMemN = Above;
+  }
+
+  InstrToNodeMap.erase(I);
+
+  // TODO: Update the dependencies.
+}
+
Interval<Instruction> DependencyGraph::extend(ArrayRef<Instruction *> Instrs) {
if (Instrs.empty())
return {};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b801d1863e25..6d02efc05614 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -170,9 +170,7 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
}
void VPBlockBase::setPlan(VPlan *ParentPlan) {
- assert(
- (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) &&
- "Can only set plan on its entry or preheader block.");
+ assert(ParentPlan->getEntry() == this && "Can only set plan on its entry.");
Plan = ParentPlan;
}
@@ -207,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
- for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry)))
- delete Block;
-}
-
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && It->isPhi())
@@ -222,9 +215,11 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
- InnerLoopVectorizer *ILV, VPlan *Plan)
+ InnerLoopVectorizer *ILV, VPlan *Plan,
+ Loop *CurrentParentLoop, Type *CanonicalIVTy)
: TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
+ CurrentParentLoop(CurrentParentLoop), LVer(nullptr),
+ TypeAnalysis(CanonicalIVTy) {}
Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
if (Def->isLiveIn())
@@ -309,9 +304,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
if (!hasScalarValue(Def, LastLane)) {
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
// VPExpandSCEVRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
- isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
+ assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe,
+ VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -360,7 +354,7 @@ void VPTransformState::addNewMetadata(Instruction *To,
const Instruction *Orig) {
// If the loop was versioned with memchecks, add the corresponding no-alias
// metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ if (LVer && isa<LoadInst, StoreInst>(Orig))
LVer->annotateInstWithNoAlias(To, Orig);
}
@@ -476,6 +470,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
connectToPredecessors(State->CFG);
}
+/// Returns a copy of this block: an empty VPIRBasicBlock for the same IRBB is
+/// created through the plan returned by getPlan(), and a clone of each recipe
+/// in Recipes is appended to it in order.
+VPIRBasicBlock *VPIRBasicBlock::clone() {
+  auto *Copy = getPlan()->createEmptyVPIRBasicBlock(IRBB);
+  for (VPRecipeBase &Recipe : Recipes) {
+    auto *RecipeCopy = Recipe.clone();
+    Copy->appendRecipe(RecipeCopy);
+  }
+  return Copy;
+}
+
void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = bool(State->Lane);
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -502,8 +503,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
// Register NewBB in its loop. In innermost loops it's the same for all
// BB's.
- if (State->CurrentVectorLoop)
- State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
+ if (State->CurrentParentLoop)
+ State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
State->Builder.SetInsertPoint(Terminator);
State->CFG.PrevBB = NewBB;
@@ -515,14 +516,11 @@ void VPBasicBlock::execute(VPTransformState *State) {
executeRecipes(State, NewBB);
}
-void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
- for (VPRecipeBase &R : Recipes) {
- for (auto *Def : R.definedValues())
- Def->replaceAllUsesWith(NewValue);
-
- for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
- R.setOperand(I, NewValue);
- }
+VPBasicBlock *VPBasicBlock::clone() {
+ auto *NewBlock = getPlan()->createVPBasicBlock(getName());
+ for (VPRecipeBase &R : *this)
+ NewBlock->appendRecipe(R.clone());
+ return NewBlock;
}
void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
@@ -543,7 +541,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
SmallVector<VPBlockBase *, 2> Succs(successors());
// Create new empty block after the block to split.
- auto *SplitBlock = new VPBasicBlock(getName() + ".split");
+ auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
VPBlockUtils::insertBlockAfter(SplitBlock, this);
// Finally, move the recipes starting at SplitAt to new block.
@@ -703,37 +701,30 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
VPRegionBlock *VPRegionBlock::clone() {
const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
- auto *NewRegion =
- new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+ auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
+ getName(), isReplicator());
for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
Block->setParent(NewRegion);
return NewRegion;
}
-void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
- // Drop all references in VPBasicBlocks and replace all uses with
- // DummyValue.
- Block->dropAllReferences(NewValue);
-}
-
void VPRegionBlock::execute(VPTransformState *State) {
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
RPOT(Entry);
if (!isReplicator()) {
// Create and register the new vector loop.
- Loop *PrevLoop = State->CurrentVectorLoop;
- State->CurrentVectorLoop = State->LI->AllocateLoop();
+ Loop *PrevLoop = State->CurrentParentLoop;
+ State->CurrentParentLoop = State->LI->AllocateLoop();
BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
// Insert the new loop into the loop nest and register the new basic blocks
// before calling any utilities such as SCEV that require valid LoopInfo.
if (ParentLoop)
- ParentLoop->addChildLoop(State->CurrentVectorLoop);
+ ParentLoop->addChildLoop(State->CurrentParentLoop);
else
- State->LI->addTopLevelLoop(State->CurrentVectorLoop);
+ State->LI->addTopLevelLoop(State->CurrentParentLoop);
// Visit the VPBlocks connected to "this", starting from it.
for (VPBlockBase *Block : RPOT) {
@@ -741,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
Block->execute(State);
}
- State->CurrentVectorLoop = PrevLoop;
+ State->CurrentParentLoop = PrevLoop;
return;
}
@@ -823,16 +814,27 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
-VPlan::~VPlan() {
- if (Entry) {
- VPValue DummyValue;
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
- Block->dropAllReferences(&DummyValue);
-
- VPBlockBase::deleteCFG(Entry);
+// Constructs a plan for loop \p L: a VPIRBasicBlock created for the loop's
+// preheader becomes the plan's entry, and a VPIRBasicBlock created for the
+// loop's header is stored as ScalarHeader.
+VPlan::VPlan(Loop *L) {
+ setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
+ ScalarHeader = createVPIRBasicBlock(L->getHeader());
+}
- Preheader->dropAllReferences(&DummyValue);
- delete Preheader;
+VPlan::~VPlan() {
+ VPValue DummyValue;
+
+ for (auto *VPB : CreatedBlocks) {
+ if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+ // Replace all operands of recipes and all VPValues defined in VPBB with
+ // DummyValue so the block can be deleted.
+ for (VPRecipeBase &R : *VPBB) {
+ for (auto *Def : R.definedValues())
+ Def->replaceAllUsesWith(&DummyValue);
+
+ for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
+ R.setOperand(I, &DummyValue);
+ }
+ }
+ delete VPB;
}
for (VPValue *VPV : VPLiveInsToFree)
delete VPV;
@@ -840,34 +842,27 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
- auto *VPIRBB = new VPIRBasicBlock(IRBB);
- for (Instruction &I :
- make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
- VPIRBB->appendRecipe(new VPIRInstruction(I));
- return VPIRBB;
-}
-
VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
bool TailFolded, Loop *TheLoop) {
- VPIRBasicBlock *Entry =
- VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader());
- VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
- VPIRBasicBlock *ScalarHeader =
- VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
- auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
+ auto Plan = std::make_unique<VPlan>(TheLoop);
+ VPBlockBase *ScalarHeader = Plan->getScalarHeader();
+
+ // Connect entry only to vector preheader initially. Entry will also be
+ // connected to the scalar preheader later, during skeleton creation when
+ // runtime guards are added as needed. Note that when executing the VPlan for
+ // an epilogue vector loop, the original entry block here will be replaced by
+ // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
+ // generating code for the main vector loop.
+ VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
+ VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
// Create SCEV and VPValue for the trip count.
-
- // Currently only loops with countable exits are vectorized, but calling
- // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
- // uncountable exits whilst also ensuring the symbolic maximum and known
- // back-edge taken count remain identical for loops with countable exits.
+ // We use the symbolic max backedge-taken-count, which works also when
+ // vectorizing loops with uncountable early exits.
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
- assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
- BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
"Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
@@ -877,17 +872,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// Create VPRegionBlock, with empty header and latch blocks, to be filled
// during processing later.
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
- VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+ VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
+ VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
- false /*isReplicator*/);
+ auto *TopRegion = Plan->createVPRegionBlock(
+ HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
- VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+ VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -902,8 +897,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// 2) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
- BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
+ auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -927,7 +922,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
}
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
- Value *CanonicalIVStartValue,
VPTransformState &State) {
Type *TCTy = TripCountV->getType();
// Check if the backedge taken count is needed, and if so build it.
@@ -953,41 +947,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
} else {
VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
}
-
- // When vectorizing the epilogue loop, the canonical induction start value
- // needs to be changed from zero to the value after the main vector loop.
- // FIXME: Improve modeling for canonical IV start values in the epilogue loop.
- if (CanonicalIVStartValue) {
- VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue);
- auto *IV = getCanonicalIV();
- assert(all_of(IV->users(),
- [](const VPUser *U) {
- return isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPScalarCastRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U) ||
- cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
- }) &&
- "the canonical IV should only be used by its increment or "
- "ScalarIVSteps when resetting the start value");
- IV->setOperand(0, VPV);
- }
-}
-
-/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
-/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
-/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
-/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
- VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
- for (auto &R : make_early_inc_range(*VPBB)) {
- assert(!R.isPhi() && "Tried to move phi recipe to end of block");
- R.moveBefore(*IRVPBB, IRVPBB->end());
- }
-
- VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-
- delete VPBB;
}
/// Generate the code inside the preheader and body of the vectorized loop.
@@ -997,27 +956,23 @@ void VPlan::execute(VPTransformState *State) {
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
- BasicBlock *VectorPreHeader = State->CFG.PrevBB;
- State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
// Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+ BasicBlock *VectorPreHeader = State->CFG.PrevBB;
cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
State->CFG.DTU.applyUpdates(
{{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
- // Replace regular VPBB's for the middle and scalar preheader blocks with
- // VPIRBasicBlocks wrapping their IR blocks. The IR blocks are created during
- // skeleton creation, so we can only create the VPIRBasicBlocks now during
- // VPlan execution rather than earlier during VPlan construction.
- BasicBlock *MiddleBB = State->CFG.ExitBB;
- VPBasicBlock *MiddleVPBB = getMiddleBlock();
- BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
- replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
- replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
+ << ", UF=" << getUF() << '\n');
+ setName("Final VPlan");
+ LLVM_DEBUG(dump());
// Disconnect the middle block from its single successor (the scalar loop
// header) in both the CFG and DT. The branch will be recreated during VPlan
// execution.
+ BasicBlock *MiddleBB = State->CFG.ExitBB;
+ BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
auto *BrInst = new UnreachableInst(MiddleBB->getContext());
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
@@ -1028,8 +983,11 @@ void VPlan::execute(VPTransformState *State) {
State->CFG.DTU.applyUpdates(
{{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
- // Generate code in the loop pre-header and body.
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
+ // Generate code for the VPlan, in parts of the vector skeleton, loop body and
+ // successor blocks including the middle, exit and scalar preheader blocks.
+ for (VPBlockBase *Block : RPOT)
Block->execute(State);
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
@@ -1043,8 +1001,7 @@ void VPlan::execute(VPTransformState *State) {
if (isa<VPWidenPHIRecipe>(&R))
continue;
- if (isa<VPWidenPointerInductionRecipe>(&R) ||
- isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ if (isa<VPWidenInductionRecipe>(&R)) {
PHINode *Phi = nullptr;
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
@@ -1079,9 +1036,6 @@ void VPlan::execute(VPTransformState *State) {
}
State->CFG.DTU.flush();
- assert(State->CFG.DTU.getDomTree().verify(
- DominatorTree::VerificationLevel::Fast) &&
- "DT not preserved correctly");
}
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
@@ -1090,6 +1044,21 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
return getVectorLoopRegion()->cost(VF, Ctx);
}
+VPRegionBlock *VPlan::getVectorLoopRegion() {
+ // TODO: Cache if possible.
+ for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+ if (auto *R = dyn_cast<VPRegionBlock>(B))
+ return R;
+ return nullptr;
+}
+
+const VPRegionBlock *VPlan::getVectorLoopRegion() const {
+ for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+ if (auto *R = dyn_cast<VPRegionBlock>(B))
+ return R;
+ return nullptr;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPlan::printLiveIns(raw_ostream &O) const {
VPSlotTracker SlotTracker(this);
@@ -1134,12 +1103,9 @@ void VPlan::print(raw_ostream &O) const {
printLiveIns(O);
- if (!getPreheader()->empty()) {
- O << "\n";
- getPreheader()->print(O, "", SlotTracker);
- }
-
- for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) {
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>>
+ RPOT(getEntry());
+ for (const VPBlockBase *Block : RPOT) {
O << '\n';
Block->print(O, "", SlotTracker);
}
@@ -1219,8 +1185,8 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
}
VPlan *VPlan::duplicate() {
+ unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
// Clone blocks.
- VPBasicBlock *NewPreheader = Preheader->clone();
const auto &[NewEntry, __] = cloneFrom(Entry);
BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
@@ -1230,8 +1196,7 @@ VPlan *VPlan::duplicate() {
return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
}));
// Create VPlan, clone live-ins and remap operands in the cloned blocks.
- auto *NewPlan =
- new VPlan(NewPreheader, cast<VPBasicBlock>(NewEntry), NewScalarHeader);
+ auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader);
DenseMap<VPValue *, VPValue *> Old2NewVPValues;
for (VPValue *OldLiveIn : VPLiveInsToFree) {
Old2NewVPValues[OldLiveIn] =
@@ -1251,7 +1216,6 @@ VPlan *VPlan::duplicate() {
// else NewTripCount will be created and inserted into Old2NewVPValues when
// TripCount is cloned. In any case NewPlan->TripCount is updated below.
- remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
// Initialize remaining fields of cloned VPlan.
@@ -1262,9 +1226,32 @@ VPlan *VPlan::duplicate() {
assert(Old2NewVPValues.contains(TripCount) &&
"TripCount must have been added to Old2NewVPValues");
NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+ // Transfer all cloned blocks (the second half of all current blocks) from
+ // current to new VPlan.
+ unsigned NumBlocksAfterCloning = CreatedBlocks.size();
+ for (unsigned I :
+ seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning))
+ NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]);
+ CreatedBlocks.truncate(NumBlocksBeforeCloning);
+
return NewPlan;
}
+VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
+ auto *VPIRBB = new VPIRBasicBlock(IRBB);
+ CreatedBlocks.push_back(VPIRBB);
+ return VPIRBB;
+}
+
+VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
+ auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
+ for (Instruction &I :
+ make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
+ VPIRBB->appendRecipe(new VPIRInstruction(I));
+ return VPIRBB;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
@@ -1303,8 +1290,6 @@ void VPlanPrinter::dump() {
OS << "edge [fontname=Courier, fontsize=30]\n";
OS << "compound=true\n";
- dumpBlock(Plan.getPreheader());
-
for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
dumpBlock(Block);
@@ -1565,7 +1550,6 @@ void VPSlotTracker::assignNames(const VPlan &Plan) {
assignName(Plan.BackedgeTakenCount);
for (VPValue *LI : Plan.VPLiveInsToFree)
assignName(LI);
- assignNames(Plan.getPreheader());
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d828f038f9..88f3f672d3aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -236,7 +236,8 @@ public:
struct VPTransformState {
VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
- InnerLoopVectorizer *ILV, VPlan *Plan);
+ InnerLoopVectorizer *ILV, VPlan *Plan,
+ Loop *CurrentParentLoop, Type *CanonicalIVTy);
/// Target Transform Info.
const TargetTransformInfo *TTI;
@@ -373,8 +374,8 @@ struct VPTransformState {
/// Pointer to the VPlan code is generated for.
VPlan *Plan;
- /// The loop object for the current parent region, or nullptr.
- Loop *CurrentVectorLoop = nullptr;
+ /// The parent loop object for the current scope, or nullptr.
+ Loop *CurrentParentLoop = nullptr;
/// LoopVersioning. It's only set up (non-null) if memchecks were
/// used.
@@ -621,6 +622,14 @@ public:
/// Remove all the successors of this block.
void clearSuccessors() { Successors.clear(); }
+ /// Swap successors of the block. The block must have exactly 2 successors.
+ // TODO: This should be part of introducing conditional branch recipes rather
+ // than being independent.
+ void swapSuccessors() {
+ assert(Successors.size() == 2 && "must have 2 successors to swap");
+ std::swap(Successors[0], Successors[1]);
+ }
+
/// The method which generates the output IR that correspond to this
/// VPBlockBase, thereby "executing" the VPlan.
virtual void execute(VPTransformState *State) = 0;
@@ -628,9 +637,6 @@ public:
/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
- /// Delete all blocks reachable from a given VPBlockBase, inclusive.
- static void deleteCFG(VPBlockBase *Entry);
-
/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
// There are currently no constraints that prevent an instruction to be
@@ -638,10 +644,6 @@ public:
return true;
}
- /// Replace all operands of VPUsers in the block with \p NewValue and also
- /// replaces all uses of VPValues defined in the block with NewValue.
- virtual void dropAllReferences(VPValue *NewValue) = 0;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
OS << getName();
@@ -944,11 +946,6 @@ public:
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
- struct GEPFlagsTy {
- char IsInBounds : 1;
- GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
- };
-
private:
struct ExactFlagsTy {
char IsExact : 1;
@@ -975,7 +972,7 @@ private:
WrapFlagsTy WrapFlags;
DisjointFlagsTy DisjointFlags;
ExactFlagsTy ExactFlags;
- GEPFlagsTy GEPFlags;
+ GEPNoWrapFlags GEPFlags;
NonNegFlagsTy NonNegFlags;
FastMathFlagsTy FMFs;
unsigned AllFlags;
@@ -1012,7 +1009,7 @@ public:
ExactFlags.IsExact = Op->isExact();
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
OpType = OperationType::GEPOp;
- GEPFlags.IsInBounds = GEP->isInBounds();
+ GEPFlags = GEP->getNoWrapFlags();
} else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
OpType = OperationType::NonNegOp;
NonNegFlags.NonNeg = PNNI->hasNonNeg();
@@ -1052,7 +1049,7 @@ public:
protected:
template <typename IterT>
VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- GEPFlagsTy GEPFlags, DebugLoc DL = {})
+ GEPNoWrapFlags GEPFlags, DebugLoc DL = {})
: VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
GEPFlags(GEPFlags) {}
@@ -1089,7 +1086,7 @@ public:
ExactFlags.IsExact = false;
break;
case OperationType::GEPOp:
- GEPFlags.IsInBounds = false;
+ GEPFlags = GEPNoWrapFlags::none();
break;
case OperationType::FPMathOp:
FMFs.NoNaNs = false;
@@ -1118,10 +1115,7 @@ public:
I->setIsExact(ExactFlags.IsExact);
break;
case OperationType::GEPOp:
- // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan.
- cast<GetElementPtrInst>(I)->setNoWrapFlags(
- GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds()
- : GEPNoWrapFlags::none());
+ cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags);
break;
case OperationType::FPMathOp:
I->setHasAllowReassoc(FMFs.AllowReassoc);
@@ -1147,11 +1141,7 @@ public:
return CmpPredicate;
}
- bool isInBounds() const {
- assert(OpType == OperationType::GEPOp &&
- "recipe doesn't have inbounds flag");
- return GEPFlags.IsInBounds;
- }
+ GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
/// Returns true if the recipe has fast-math flags.
bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }
@@ -1232,6 +1222,9 @@ public:
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ // Returns a scalar boolean value, which is true if any lane of its single
+ // operand is true.
+ AnyOf,
};
private:
@@ -1295,7 +1288,7 @@ public:
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}
- VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,
+ VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags,
DebugLoc DL = {}, const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC,
ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
@@ -1336,14 +1329,6 @@ public:
LLVM_DUMP_METHOD void dump() const;
#endif
- /// Return true if this instruction may modify memory.
- bool mayWriteToMemory() const {
- // TODO: we can use attributes of the called function to rule out memory
- // modifications.
- return Opcode == Instruction::Store || Opcode == Instruction::Call ||
- Opcode == Instruction::Invoke || Opcode == SLPStore;
- }
-
bool hasResult() const {
// CallInst may or may not have a result, depending on the called function.
// Conservatively return calls have results for now.
@@ -1662,7 +1647,7 @@ public:
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+ : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -1697,6 +1682,9 @@ public:
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
+ /// Return the ID of the intrinsic.
+ Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
+
/// Return the scalar return type of the intrinsic.
Type *getResultType() const { return ResultTy; }
@@ -1911,10 +1899,9 @@ class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
public:
VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
- bool IsInBounds, DebugLoc DL)
+ GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
- ArrayRef<VPValue *>({Ptr, VF}),
- GEPFlagsTy(IsInBounds), DL),
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC)
@@ -1946,8 +1933,9 @@ public:
}
VPReverseVectorPointerRecipe *clone() override {
- return new VPReverseVectorPointerRecipe(
- getOperand(0), getVFValue(), IndexedTy, isInBounds(), getDebugLoc());
+ return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(),
+ IndexedTy, getGEPNoWrapFlags(),
+ getDebugLoc());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1963,10 +1951,10 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
Type *IndexedTy;
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsInBounds,
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlagsTy(IsInBounds), DL),
+ GEPFlags, DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -1988,8 +1976,8 @@ public:
}
VPVectorPointerRecipe *clone() override {
- return new VPVectorPointerRecipe(getOperand(0), IndexedTy, isInBounds(),
- getDebugLoc());
+ return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+ getGEPNoWrapFlags(), getDebugLoc());
}
/// Return the cost of this VPHeaderPHIRecipe.
@@ -2088,28 +2076,72 @@ public:
}
};
+/// Base class for widened induction (VPWidenIntOrFpInductionRecipe and
+/// VPWidenPointerInductionRecipe), providing shared functionality, including
+/// retrieving the step value, induction descriptor and original phi node.
+class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
+ const InductionDescriptor &IndDesc;
+
+public:
+ VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start,
+ VPValue *Step, const InductionDescriptor &IndDesc,
+ DebugLoc DL)
+ : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) {
+ addOperand(Step);
+ }
+
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC ||
+ R->getVPDefID() == VPDef::VPWidenPointerInductionSC;
+ }
+
+ virtual void execute(VPTransformState &State) override = 0;
+
+ /// Returns the step value of the induction.
+ VPValue *getStepValue() { return getOperand(1); }
+ const VPValue *getStepValue() const { return getOperand(1); }
+
+ PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); }
+
+ /// Returns the induction descriptor for the recipe.
+ const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
+
+ VPValue *getBackedgeValue() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+
+ VPRecipeBase &getBackedgeRecipe() override {
+ // TODO: All operands of base recipe must exist and be at same index in
+ // derived recipe.
+ llvm_unreachable(
+ "VPWidenIntOrFpInductionRecipe generates its own backedge value");
+ }
+};
+
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector values.
-class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
- PHINode *IV;
+class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
TruncInst *Trunc;
- const InductionDescriptor &IndDesc;
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
- VPValue *VF, const InductionDescriptor &IndDesc)
- : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
- Trunc(nullptr), IndDesc(IndDesc) {
- addOperand(Step);
+ VPValue *VF, const InductionDescriptor &IndDesc,
+ DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
+ Step, IndDesc, DL),
+ Trunc(nullptr) {
addOperand(VF);
}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
VPValue *VF, const InductionDescriptor &IndDesc,
- TruncInst *Trunc)
- : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
- IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
- addOperand(Step);
+ TruncInst *Trunc, DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
+ Step, IndDesc, DL),
+ Trunc(Trunc) {
addOperand(VF);
}
@@ -2117,7 +2149,8 @@ public:
VPWidenIntOrFpInductionRecipe *clone() override {
return new VPWidenIntOrFpInductionRecipe(
- IV, getStartValue(), getStepValue(), getVFValue(), IndDesc, Trunc);
+ getPHINode(), getStartValue(), getStepValue(), getVFValue(),
+ getInductionDescriptor(), Trunc, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
@@ -2132,24 +2165,6 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- VPValue *getBackedgeValue() override {
- // TODO: All operands of base recipe must exist and be at same index in
- // derived recipe.
- llvm_unreachable(
- "VPWidenIntOrFpInductionRecipe generates its own backedge value");
- }
-
- VPRecipeBase &getBackedgeRecipe() override {
- // TODO: All operands of base recipe must exist and be at same index in
- // derived recipe.
- llvm_unreachable(
- "VPWidenIntOrFpInductionRecipe generates its own backedge value");
- }
-
- /// Returns the step value of the induction.
- VPValue *getStepValue() { return getOperand(1); }
- const VPValue *getStepValue() const { return getOperand(1); }
-
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }
@@ -2164,11 +2179,6 @@ public:
TruncInst *getTruncInst() { return Trunc; }
const TruncInst *getTruncInst() const { return Trunc; }
- PHINode *getPHINode() { return IV; }
-
- /// Returns the induction descriptor for the recipe.
- const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-
/// Returns true if the induction is canonical, i.e. starting at 0 and
/// incremented by UF * VF (= the original IV is incremented by 1) and has the
/// same type as the canonical induction.
@@ -2176,7 +2186,7 @@ public:
/// Returns the scalar type of the induction.
Type *getScalarType() const {
- return Trunc ? Trunc->getType() : IV->getType();
+ return Trunc ? Trunc->getType() : getPHINode()->getType();
}
/// Returns the VPValue representing the value of this induction at
@@ -2187,10 +2197,8 @@ public:
}
};
-class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
+class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
public VPUnrollPartAccessor<3> {
- const InductionDescriptor &IndDesc;
-
bool IsScalarAfterVectorization;
public:
@@ -2198,20 +2206,17 @@ public:
/// Start.
VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
const InductionDescriptor &IndDesc,
- bool IsScalarAfterVectorization)
- : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi),
- IndDesc(IndDesc),
- IsScalarAfterVectorization(IsScalarAfterVectorization) {
- addOperand(Start);
- addOperand(Step);
- }
+ bool IsScalarAfterVectorization, DebugLoc DL)
+ : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start,
+ Step, IndDesc, DL),
+ IsScalarAfterVectorization(IsScalarAfterVectorization) {}
~VPWidenPointerInductionRecipe() override = default;
VPWidenPointerInductionRecipe *clone() override {
return new VPWidenPointerInductionRecipe(
cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
- IndDesc, IsScalarAfterVectorization);
+ getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
@@ -2222,9 +2227,6 @@ public:
/// Returns true if only scalar values will be generated.
bool onlyScalarsGenerated(bool IsScalable);
- /// Returns the induction descriptor for the recipe.
- const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-
/// Returns the VPValue representing the value of this induction at
/// the first unrolled part, if it exists. Returns itself if unrolling did not
/// take place.
@@ -2589,8 +2591,9 @@ class VPReductionRecipe : public VPSingleDefRecipe {
protected:
VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
Instruction *I, ArrayRef<VPValue *> Operands,
- VPValue *CondOp, bool IsOrdered)
- : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
+ VPValue *CondOp, bool IsOrdered, DebugLoc DL)
+ : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
+ IsOrdered(IsOrdered) {
if (CondOp) {
IsConditional = true;
addOperand(CondOp);
@@ -2600,16 +2603,17 @@ protected:
public:
VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered)
+ bool IsOrdered, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, R, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered) {}
+ IsOrdered, DL) {}
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
- getVecOp(), getCondOp(), IsOrdered);
+ getVecOp(), getCondOp(), IsOrdered,
+ getDebugLoc());
}
static inline bool classof(const VPRecipeBase *R) {
@@ -2664,7 +2668,7 @@ public:
VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
cast_or_null<Instruction>(R.getUnderlyingValue()),
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
- R.isOrdered()) {}
+ R.isOrdered(), R.getDebugLoc()) {}
~VPReductionEVLRecipe() override = default;
@@ -2834,12 +2838,12 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe {
public:
/// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
/// nodes after merging back from a Branch-on-Mask.
- VPPredInstPHIRecipe(VPValue *PredV)
- : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
+ VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL)
+ : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {}
~VPPredInstPHIRecipe() override = default;
VPPredInstPHIRecipe *clone() override {
- return new VPPredInstPHIRecipe(getOperand(0));
+ return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
@@ -3203,11 +3207,6 @@ public:
return true;
}
- /// Check if the induction described by \p Kind, /p Start and \p Step is
- /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
- bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
- VPValue *Step) const;
-
/// Return the cost of this VPCanonicalIVPHIRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override {
@@ -3551,8 +3550,6 @@ public:
return make_range(begin(), getFirstNonPhi());
}
- void dropAllReferences(VPValue *NewValue) override;
-
/// Split current block at \p SplitAt by inserting a new block between the
/// current block and its successors and moving all recipes starting at
/// SplitAt to the new block. Returns the new block.
@@ -3582,12 +3579,7 @@ public:
/// Clone the current block and it's recipes, without updating the operands of
/// the cloned recipes.
- VPBasicBlock *clone() override {
- auto *NewBlock = new VPBasicBlock(getName());
- for (VPRecipeBase &R : *this)
- NewBlock->appendRecipe(R.clone());
- return NewBlock;
- }
+ VPBasicBlock *clone() override;
protected:
/// Execute the recipes in the IR basic block \p BB.
@@ -3623,20 +3615,11 @@ public:
return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
}
- /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
- /// instructions in \p IRBB, except its terminator which is managed in VPlan.
- static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB);
-
/// The method which generates the output IR instructions that correspond to
/// this VPBasicBlock, thereby "executing" the VPlan.
void execute(VPTransformState *State) override;
- VPIRBasicBlock *clone() override {
- auto *NewBlock = new VPIRBasicBlock(IRBB);
- for (VPRecipeBase &R : Recipes)
- NewBlock->appendRecipe(R.clone());
- return NewBlock;
- }
+ VPIRBasicBlock *clone() override;
BasicBlock *getIRBasicBlock() const { return IRBB; }
};
@@ -3675,13 +3658,7 @@ public:
: VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
IsReplicator(IsReplicator) {}
- ~VPRegionBlock() override {
- if (Entry) {
- VPValue DummyValue;
- Entry->dropAllReferences(&DummyValue);
- deleteCFG(Entry);
- }
- }
+ ~VPRegionBlock() override {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPBlockBase *V) {
@@ -3729,8 +3706,6 @@ public:
// Return the cost of this region.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
- void dropAllReferences(VPValue *NewValue) override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
/// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
@@ -3757,14 +3732,12 @@ class VPlan {
friend class VPlanPrinter;
friend class VPSlotTracker;
- /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
- /// preheader of the vector loop.
- VPBasicBlock *Entry;
-
/// VPBasicBlock corresponding to the original preheader. Used to place
/// VPExpandSCEV recipes for expressions used during skeleton creation and the
/// rest of VPlan execution.
- VPBasicBlock *Preheader;
+ /// When this VPlan is used for the epilogue vector loop, the entry will be
+ /// replaced by a new entry block created during skeleton creation.
+ VPBasicBlock *Entry;
/// VPIRBasicBlock wrapping the header of the original scalar loop.
VPIRBasicBlock *ScalarHeader;
@@ -3809,46 +3782,50 @@ class VPlan {
/// been modeled in VPlan directly.
DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
-public:
- /// Construct a VPlan with original preheader \p Preheader, trip count \p TC,
- /// \p Entry to the plan and with \p ScalarHeader wrapping the original header
- /// of the scalar loop. At the moment, \p Preheader and \p Entry need to be
- /// disconnected, as the bypass blocks between them are not yet modeled in
- /// VPlan.
- VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry,
- VPIRBasicBlock *ScalarHeader)
- : VPlan(Preheader, Entry, ScalarHeader) {
- TripCount = TC;
- }
+ /// Blocks allocated and owned by the VPlan. They will be deleted once the
+ /// VPlan is destroyed.
+ SmallVector<VPBlockBase *> CreatedBlocks;
- /// Construct a VPlan with original preheader \p Preheader, \p Entry to
- /// the plan and with \p ScalarHeader wrapping the original header of the
- /// scalar loop. At the moment, \p Preheader and \p Entry need to be
- /// disconnected, as the bypass blocks between them are not yet modeled in
- /// VPlan.
- VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry,
- VPIRBasicBlock *ScalarHeader)
- : Entry(Entry), Preheader(Preheader), ScalarHeader(ScalarHeader) {
+ /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
+ /// wrapping the original header of the scalar loop.
+ VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
+ : Entry(Entry), ScalarHeader(ScalarHeader) {
Entry->setPlan(this);
- Preheader->setPlan(this);
- assert(Preheader->getNumSuccessors() == 0 &&
- Preheader->getNumPredecessors() == 0 &&
- "preheader must be disconnected");
assert(ScalarHeader->getNumSuccessors() == 0 &&
"scalar header must be a leaf node");
}
+public:
+ /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
+ /// original preheader and scalar header of \p L, to be used as entry and
+ /// scalar header blocks of the new VPlan.
+ VPlan(Loop *L);
+
+ /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
+ /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
+ VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
+ setEntry(createVPBasicBlock("preheader"));
+ ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
+ TripCount = TC;
+ }
+
~VPlan();
+ void setEntry(VPBasicBlock *VPBB) {
+ Entry = VPBB;
+ VPBB->setPlan(this);
+ }
+
/// Create initial VPlan, having an "entry" VPBasicBlock (wrapping
- /// original scalar pre-header ) which contains SCEV expansions that need
- /// to happen before the CFG is modified; a VPBasicBlock for the vector
- /// pre-header, followed by a region for the vector loop, followed by the
- /// middle VPBasicBlock. If a check is needed to guard executing the scalar
- /// epilogue loop, it will be added to the middle block, together with
- /// VPBasicBlocks for the scalar preheader and exit blocks.
- /// \p InductionTy is the type of the canonical induction and used for related
- /// values, like the trip count expression.
+ /// original scalar pre-header) which contains SCEV expansions that need
+ /// to happen before the CFG is modified (when executing a VPlan for the
+ /// epilogue vector loop, the original entry needs to be replaced by a new
+ /// one); a VPBasicBlock for the vector pre-header, followed by a region for
+ /// the vector loop, followed by the middle VPBasicBlock. If a check is needed
+ /// to guard executing the scalar epilogue loop, it will be added to the
+ /// middle block, together with VPBasicBlocks for the scalar preheader and
+ /// exit blocks. \p InductionTy is the type of the canonical induction and
+ /// used for related values, like the trip count expression.
static VPlanPtr createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
@@ -3856,7 +3833,7 @@ public:
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
- Value *CanonicalIVStartValue, VPTransformState &State);
+ VPTransformState &State);
/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
@@ -3873,26 +3850,22 @@ public:
}
/// Returns the VPRegionBlock of the vector loop.
- VPRegionBlock *getVectorLoopRegion() {
- return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
- }
- const VPRegionBlock *getVectorLoopRegion() const {
- return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
- }
+ VPRegionBlock *getVectorLoopRegion();
+ const VPRegionBlock *getVectorLoopRegion() const;
/// Returns the 'middle' block of the plan, that is the block that selects
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch.
const VPBasicBlock *getMiddleBlock() const {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
}
VPBasicBlock *getMiddleBlock() {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
}
/// Return the VPBasicBlock for the preheader of the scalar loop.
VPBasicBlock *getScalarPreheader() const {
- return cast<VPBasicBlock>(ScalarHeader->getSinglePredecessor());
+ return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor());
}
/// Return the VPIRBasicBlock wrapping the header of the scalar loop.
@@ -4027,13 +4000,52 @@ public:
SCEVToExpansion[S] = V;
}
- /// \return The block corresponding to the original preheader.
- VPBasicBlock *getPreheader() { return Preheader; }
- const VPBasicBlock *getPreheader() const { return Preheader; }
-
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
+
+ /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
+ /// present. The returned block is owned by the VPlan and deleted once the
+ /// VPlan is destroyed.
+ VPBasicBlock *createVPBasicBlock(const Twine &Name,
+ VPRecipeBase *Recipe = nullptr) {
+ auto *VPB = new VPBasicBlock(Name, Recipe);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+ /// IsReplicator is true, the region is a replicate region. The returned block
+ /// is owned by the VPlan and deleted once the VPlan is destroyed.
+ VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+ const std::string &Name = "",
+ bool IsReplicator = false) {
+ auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+ /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+ /// The returned block is owned by the VPlan and deleted once the VPlan is
+ /// destroyed.
+ VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+ bool IsReplicator = false) {
+ auto *VPB = new VPRegionBlock(Name, IsReplicator);
+ CreatedBlocks.push_back(VPB);
+ return VPB;
+ }
+
+ /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
+ /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
+ /// block is owned by the VPlan and deleted once the VPlan is destroyed.
+ VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
+
+ /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+ /// instructions in \p IRBB, except its terminator which is managed by the
+ /// successors of the block in VPlan. The returned block is owned by the VPlan
+ /// and deleted once the VPlan is destroyed.
+ VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -4179,8 +4191,6 @@ public:
"Can't connect two block with different parents");
assert((SuccIdx != -1u || From->getNumSuccessors() < 2) &&
"Blocks can't have more than two successors.");
- assert((PredIdx != -1u || To->getNumPredecessors() < 2) &&
- "Blocks can't have more than two predecessors.");
if (SuccIdx == -1u)
From->appendSuccessor(To);
else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 969d07b229e4..35497a7431f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -61,10 +61,16 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case Instruction::ICmp:
case VPInstruction::ActiveLaneMask:
return inferScalarType(R->getOperand(1));
+ case VPInstruction::ComputeReductionResult: {
+ auto *PhiR = cast<VPReductionPHIRecipe>(R->getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ return OrigPhi->getType();
+ }
case VPInstruction::ExplicitVectorLength:
return Type::getIntNTy(Ctx, 32);
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
+ case VPInstruction::ResumePhi:
return SetResultTyFromOp();
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
@@ -127,7 +133,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
+ assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6e633739fcc3..76ed578424df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
// Create new VPBB.
StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
- VPBasicBlock *VPBB = new VPBasicBlock(Name);
+ VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
BB2VPBB[BB] = VPBB;
// Get or create a region for the loop containing BB.
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
if (LoopOfBB == TheLoop) {
RegionOfVPBB = Plan.getVectorLoopRegion();
} else {
- RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+ RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
}
RegionOfVPBB->setEntry(VPBB);
@@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() {
BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
VectorHeaderVPBB->clearSuccessors();
VectorLatchVPBB->clearPredecessors();
- if (TheLoop->getHeader() != TheLoop->getLoopLatch()) {
+ if (TheLoop->getHeader() != TheLoop->getLoopLatch())
BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
- } else {
+ else
TheRegion->setExiting(VectorHeaderVPBB);
- delete VectorLatchVPBB;
- }
// 1. Scan the body of the loop in a topological order to visit each basic
// block after having visited its predecessor basic blocks. Create a VPBB for
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 9e8f9f3f4002..ad6e2ad90a96 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -32,11 +32,11 @@ class Loop;
class LoopInfo;
class VPRegionBlock;
class VPlan;
-class VPlanTestBase;
+class VPlanTestIRBase;
/// Main class to build the VPlan H-CFG for an incoming IR.
class VPlanHCFGBuilder {
- friend VPlanTestBase;
+ friend VPlanTestIRBase;
private:
// The outermost loop of the input loop nest considered for vectorization.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 18e5e2996c82..ec3c203a61b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -78,6 +78,8 @@ template <unsigned BitWidth = 0> struct specific_intval {
if (!VPV->isLiveIn())
return false;
Value *V = VPV->getLiveInIRValue();
+ if (!V)
+ return false;
const auto *CI = dyn_cast<ConstantInt>(V);
if (!CI && V->getType()->isVectorTy())
if (const auto *C = dyn_cast<Constant>(V))
@@ -136,7 +138,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
// Check for recipes that do not have opcodes.
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
- std::is_same<RecipeTy, VPWidenSelectRecipe>::value)
+ std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
+ std::is_same<RecipeTy, VPDerivedIVRecipe>::value)
return DefR;
else
return DefR && DefR->getOpcode() == Opcode;
@@ -382,6 +385,17 @@ inline VPScalarIVSteps_match<Op0_t, Op1_t> m_ScalarIVSteps(const Op0_t &Op0,
const Op1_t &Op1) {
return VPScalarIVSteps_match<Op0_t, Op1_t>(Op0, Op1);
}
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+using VPDerivedIV_match =
+ Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0, false, VPDerivedIVRecipe>;
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPDerivedIV_match<Op0_t, Op1_t, Op2_t>
+m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
+}
+
} // namespace VPlanPatternMatch
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ef5f6e22f822..77c08839dbfa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -57,6 +57,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case Instruction::Or:
case Instruction::ICmp:
case Instruction::Select:
+ case VPInstruction::AnyOf:
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -361,6 +362,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::AnyOf:
return true;
default:
return false;
@@ -565,6 +567,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
+ ReducedPartRdx =
+ createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
@@ -573,7 +578,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
// Create the reduction after the loop. Note that inloop reductions create
// the target reduction in the loop using a Reduction recipe.
if ((State.VF.isVector() ||
- RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
!PhiR->isInLoop()) {
ReducedPartRdx =
createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
@@ -615,8 +621,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Addend = State.get(getOperand(1), VPLane(0));
- return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
- : Builder.CreatePtrAdd(Ptr, Addend, Name);
+ return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::ResumePhi: {
Value *IncomingFromVPlanPred =
@@ -624,18 +629,22 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *IncomingFromOtherPreds =
State.get(getOperand(1), /* IsScalar */ true);
auto *NewPhi =
- Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
+ Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
BasicBlock *VPlanPred =
State.CFG
- .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
+ .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])];
NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
- assert(OtherPred != VPlanPred &&
- "VPlan predecessors should not be connected yet");
+ if (OtherPred == VPlanPred)
+ continue;
NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
}
return NewPhi;
}
+ case VPInstruction::AnyOf: {
+ Value *A = State.get(getOperand(0));
+ return Builder.CreateOrReduce(A);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -644,7 +653,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
@@ -707,6 +717,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return false;
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::Or:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
@@ -802,6 +813,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::AnyOf:
+ O << "any-of";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -819,12 +833,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
void VPIRInstruction::execute(VPTransformState &State) {
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
"Only PHINodes can have extra operands");
- if (getNumOperands() == 1) {
- VPValue *ExitValue = getOperand(0);
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ VPValue *ExitValue = Op;
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
- auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
@@ -857,11 +872,13 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- assert(getNumOperands() == 1 && "can have at most 1 operand");
- O << " (extra operand: ";
- getOperand(0)->printAsOperand(O, SlotTracker);
- O << " from ";
- getParent()->getPredecessors()[0]->printAsOperand(O);
+ O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
+ interleaveComma(
+ enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
+ Op.value()->printAsOperand(O, SlotTracker);
+ O << " from ";
+ getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
+ });
O << ")";
}
}
@@ -950,7 +967,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
+ if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
+ State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
@@ -964,7 +982,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
Module *M = State.Builder.GetInsertBlock()->getModule();
Function *VectorF =
Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
+ assert(VectorF &&
+ "Can't retrieve vector intrinsic or vector-predication intrinsics.");
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
@@ -1012,11 +1031,11 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
SmallVector<Type *> ParamTys;
for (unsigned I = 0; I != getNumOperands(); ++I)
ParamTys.push_back(
- ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
@@ -1184,7 +1203,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
VPValue *Op0, *Op1;
@@ -1254,8 +1273,12 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
getFastMathFlags().print(O);
break;
case OperationType::GEPOp:
- if (GEPFlags.IsInBounds)
+ if (GEPFlags.isInBounds())
O << " inbounds";
+ else if (GEPFlags.hasNoUnsignedSignedWrap())
+ O << " nusw";
+ if (GEPFlags.hasNoUnsignedWrap())
+ O << " nuw";
break;
case OperationType::NonNegOp:
if (NonNegFlags.NonNeg)
@@ -1361,7 +1384,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
switch (Opcode) {
case Instruction::FNeg: {
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(
Opcode, VectorTy, CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
@@ -1399,7 +1422,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
getOperand(1)->isDefinedOutsideLoopRegions())
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
SmallVector<const Value *, 4> Operands;
@@ -1412,13 +1435,13 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
}
case Instruction::Freeze: {
// This opcode is unknown. Assume that it is the same as 'mul'.
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
- Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
@@ -1546,8 +1569,8 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
}
auto *SrcTy =
- cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF));
- auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF));
+ cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
+ auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
// Arm TTI will use the underlying instruction to determine the cost.
return Ctx.TTI.getCastInstrCost(
Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
@@ -1559,7 +1582,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
- O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+ O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
printOperands(O, SlotTracker);
O << " to " << *getResultType();
@@ -1572,10 +1595,10 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
}
/// This function adds
-/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
-/// to each vector element of Val. The sequence starts at StartIndex.
+/// (0 * Step, 1 * Step, 2 * Step, ...)
+/// to each vector element of Val.
/// \p Opcode is relevant for FP induction variable.
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+static Value *getStepVector(Value *Val, Value *Step,
Instruction::BinaryOps BinOp, ElementCount VF,
IRBuilderBase &Builder) {
assert(VF.isVector() && "only vector VFs are supported");
@@ -1600,11 +1623,7 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
}
Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw
@@ -1617,7 +1636,6 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
Value *MulOp = Builder.CreateFMul(InitVec, Step);
@@ -1638,12 +1656,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
const InductionDescriptor &ID = getInductionDescriptor();
TruncInst *Trunc = getTruncInst();
IRBuilderBase &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
+ "Types must match");
assert(State.VF.isVector() && "must have vector VF");
// The value from the original loop to which we are mapping the new induction
// variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
@@ -1668,10 +1687,9 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+ Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(),
+ State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -1711,14 +1729,14 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ VecInd->setDebugLoc(getDebugLoc());
State.set(this, VecInd);
Instruction *LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
if (isa<TruncInst>(EntryVal))
State.addMetadata(LastInduction, EntryVal);
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ LastInduction->setDebugLoc(getDebugLoc());
VecInd->addIncoming(SteppedStart, VectorPH);
// Add induction update using an incorrect block temporarily. The phi node
@@ -1732,20 +1750,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << Indent << "WIDEN-INDUCTION";
- if (getTruncInst()) {
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" ";
- getVPValue(0)->printAsOperand(O, SlotTracker);
- } else
- O << " " << VPlanIngredient(IV);
-
- O << ", ";
- getStepValue()->printAsOperand(O, SlotTracker);
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = WIDEN-INDUCTION ";
+ printOperands(O, SlotTracker);
- O << ", ";
- getVFValue()->printAsOperand(O, SlotTracker);
+ if (auto *TI = getTruncInst())
+ O << " (truncated to " << *TI->getType() << ")";
}
#endif
@@ -1896,9 +1907,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
Ops.push_back(State.get(getOperand(I), VPLane(0)));
- auto *NewGEP =
- State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
- ArrayRef(Ops).drop_front(), "", isInBounds());
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
+ ArrayRef(Ops).drop_front(), "",
+ getGEPNoWrapFlags());
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
State.set(this, Splat);
State.addMetadata(Splat, GEP);
@@ -1924,7 +1935,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
- Indices, "", isInBounds());
+ Indices, "", getGEPNoWrapFlags());
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP);
@@ -1975,9 +1986,10 @@ void VPReverseVectorPointerRecipe::execute(VPTransformState &State) {
// LastLane = 1 - RunTimeVF
Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- bool InBounds = isInBounds();
- Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
- ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds);
+ Value *ResultPtr =
+ Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
+ ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
+ getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -1987,9 +1999,8 @@ void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
- O << " = reverse-vector-pointer ";
- if (isInBounds())
- O << "inbounds ";
+ O << " = reverse-vector-pointer";
+ printFlags(O);
printOperands(O, SlotTracker);
}
#endif
@@ -2001,10 +2012,10 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
- bool InBounds = isInBounds();
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
- Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
+ Value *ResultPtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -2066,8 +2077,8 @@ InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
if (vputils::onlyFirstLaneUsed(this))
return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
- Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
- Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
+ Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
return (getNumIncomingValues() - 1) *
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -2104,6 +2115,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ State.setDebugLocFrom(getDebugLoc());
Value *NewVecOp = State.get(getVecOp());
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, State.VF.isScalar());
@@ -2188,7 +2200,7 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
RecurKind RdxKind = RdxDesc.getRecurrenceKind();
Type *ElementTy = Ctx.Types.inferScalarType(this);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned Opcode = RdxDesc.getOpcode();
@@ -2380,6 +2392,7 @@ InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
assert(State.Lane && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
cast<Instruction>(State.get(getOperand(0), *State.Lane));
@@ -2439,7 +2452,7 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -2586,7 +2599,7 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
// legacy model, it will always calculate the cost of mask.
// TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
// don't need to compare to the legacy cost model.
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -2707,7 +2720,7 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
// legacy model, it will always calculate the cost of mask.
// TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
// don't need to compare to the legacy cost model.
- Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+ Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Align Alignment =
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
@@ -3075,7 +3088,7 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
Type *ValTy = Ctx.Types.inferScalarType(
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
: getStoredValues()[InsertPosIdx]);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -3111,31 +3124,14 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool VPCanonicalIVPHIRecipe::isCanonical(
- InductionDescriptor::InductionKind Kind, VPValue *Start,
- VPValue *Step) const {
- // Must be an integer induction.
- if (Kind != InductionDescriptor::IK_IntInduction)
- return false;
- // Start must match the start value of this canonical induction.
- if (Start != getStartValue())
- return false;
-
- // If the step is defined by a recipe, it is not a ConstantInt.
- if (Step->getDefiningRecipe())
- return false;
-
- ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
- return StepC && StepC->isOne();
-}
-
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
return IsScalarAfterVectorization &&
(!IsScalable || vputils::onlyFirstLaneUsed(this));
}
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
- assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
+ assert(getInductionDescriptor().getKind() ==
+ InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
@@ -3160,6 +3156,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
CanonicalIV->getIterator());
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+ NewPointerPhi->setDebugLoc(getDebugLoc());
} else {
// The recipe has been unrolled. In that case, fetch the single pointer phi
// shared among all unrolled parts of the recipe.
@@ -3170,8 +3167,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
// A pointer induction, performed by using a gep
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
- Value *ScalarStepValue = State.get(getOperand(1), VPLane(0));
- Type *PhiType = IndDesc.getStep()->getType();
+ Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
+ Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
// Add induction update using an incorrect block temporarily. The phi node
// will be fixed after VPlan execution. Note that at this point the latch
@@ -3223,7 +3220,8 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = WIDEN-POINTER-INDUCTION ";
getStartValue()->printAsOperand(O, SlotTracker);
- O << ", " << *IndDesc.getStep();
+ O << ", ";
+ getStepValue()->printAsOperand(O, SlotTracker);
if (getNumOperands() == 4) {
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
@@ -3235,13 +3233,22 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "cannot be used in per-lane");
+ if (State.ExpandedSCEVs.contains(Expr)) {
+ // SCEV Expr has already been expanded, result must already be set. At the
+ // moment we have to execute the entry block twice (once before skeleton
+ // creation to get expanded SCEVs used by the skeleton and once during
+ // regular VPlan execution).
+ State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
+ assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
+ "Results must match");
+ return;
+ }
+
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
SCEVExpander Exp(SE, DL, "induction");
Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
&*State.Builder.GetInsertPoint());
- assert(!State.ExpandedSCEVs.contains(Expr) &&
- "Same SCEV expanded multiple times");
State.ExpandedSCEVs[Expr] = Res;
State.set(this, Res, VPLane(0));
}
@@ -3324,7 +3331,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
Type *VectorTy =
- ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+ toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
cast<VectorType>(VectorTy), Mask, CostKind,
@@ -3358,7 +3365,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
: VectorType::get(StartV->getType(), State.VF);
BasicBlock *HeaderBB = State.CFG.PrevBB;
- assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
+ assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
"recipe must be in the vector loop header");
auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
@@ -3380,6 +3387,22 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Builder.SetInsertPoint(VectorPH->getTerminator());
StartV = Iden = State.get(StartVPV);
}
+ } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ // [I|F]FindLastIV will use a sentinel value to initialize the reduction
+ // phi or the resume value from the main vector loop when vectorizing the
+ // epilogue loop. In the exit block, ComputeReductionResult will generate
+ // checks to verify if the reduction result is the sentinel value. If the
+ // result is the sentinel value, it will be corrected back to the start
+ // value.
+ // TODO: The sentinel value is not always necessary. When the start value is
+ // a constant, and smaller than the start value of the induction variable,
+ // the start value can be directly used to initialize the reduction phi.
+ Iden = StartV;
+ if (!ScalarPHI) {
+ IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ }
} else {
Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
RdxDesc.getFastMathFlags());
@@ -3483,7 +3506,7 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
void VPScalarPHIRecipe::execute(VPTransformState &State) {
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
- Value *Start = State.get(getOperand(0), VPLane(0));
+ Value *Start = State.get(getStartValue(), VPLane(0));
PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
Phi->addIncoming(Start, VectorPH);
Phi->setDebugLoc(getDebugLoc());
@@ -3493,7 +3516,7 @@ void VPScalarPHIRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << Indent << "SCALAR-PHI";
+ O << Indent << "SCALAR-PHI ";
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cee83d1015b5..8ac2bd5160c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -61,8 +61,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
- NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step,
- &Plan->getVF(), *II);
+ NewRecipe = new VPWidenIntOrFpInductionRecipe(
+ Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
} else {
assert(isa<VPInstruction>(&Ingredient) &&
"only VPInstructions expected here");
@@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
- SetVector<VPRegionBlock *> DeletedRegions;
+ SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
// Collect replicate regions followed by an empty block, followed by another
// replicate region with matching masks to process front. This is to avoid
@@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
// Move recipes from Region1 to its successor region, if both are triangles.
for (VPRegionBlock *Region1 : WorkList) {
- if (DeletedRegions.contains(Region1))
+ if (TransformedRegions.contains(Region1))
continue;
auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
@@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
}
VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
- DeletedRegions.insert(Region1);
+ TransformedRegions.insert(Region1);
}
- for (VPRegionBlock *ToDelete : DeletedRegions)
- delete ToDelete;
- return !DeletedRegions.empty();
+ return !TransformedRegions.empty();
}
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
@@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
assert(Instr->getParent() && "Predicated instruction not in any basic block");
auto *BlockInMask = PredRecipe->getMask();
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *Entry =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
// Replace predicated replicate recipe with a replicate recipe without a
// mask but in the replicate region.
@@ -318,17 +317,21 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
PredRecipe->getUnderlyingInstr(),
make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
PredRecipe->isUniform());
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+ auto *Pred =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
VPPredInstPHIRecipe *PHIRecipe = nullptr;
if (PredRecipe->getNumUsers() != 0) {
- PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
+ PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
+ RecipeWithoutMask->getDebugLoc());
PredRecipe->replaceAllUsesWith(PHIRecipe);
PHIRecipe->setOperand(0, RecipeWithoutMask);
}
PredRecipe->eraseFromParent();
- auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+ auto *Exiting =
+ Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ VPRegionBlock *Region =
+ Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -377,7 +380,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
continue;
auto *PredVPBB =
dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
- if (!PredVPBB || PredVPBB->getNumSuccessors() != 1)
+ if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
+ isa<VPIRBasicBlock>(PredVPBB))
continue;
WorkList.push_back(VPBB);
}
@@ -394,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
VPBlockUtils::disconnectBlocks(VPBB, Succ);
VPBlockUtils::connectBlocks(PredVPBB, Succ);
}
- delete VPBB;
+ // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
}
@@ -526,11 +530,8 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
VPValue *StartV, VPValue *Step, VPBuilder &Builder) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
- VPSingleDefRecipe *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
- BaseIV = Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step,
- "offset.idx");
- }
+ VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
+ Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
// Truncate base induction if needed.
Type *CanonicalIVType = CanonicalIV->getScalarType();
@@ -661,6 +662,151 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+ using namespace llvm::VPlanPatternMatch;
+
+ if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+ // Try to remove redundant blend recipes.
+ SmallPtrSet<VPValue *, 4> UniqueValues;
+ if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(0));
+ for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+ if (!match(Blend->getMask(I), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(I));
+
+ if (UniqueValues.size() == 1) {
+ Blend->replaceAllUsesWith(*UniqueValues.begin());
+ Blend->eraseFromParent();
+ return;
+ }
+
+ if (Blend->isNormalized())
+ return;
+
+ // Normalize the blend so its first incoming value is used as the initial
+ // value with the others blended into it.
+
+ unsigned StartIndex = 0;
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ // If a value's mask is used only by the blend then it can be deadcoded.
+ // TODO: Find the most expensive mask that can be deadcoded, or a mask
+ // that's used by multiple blends where it can be removed from them all.
+ VPValue *Mask = Blend->getMask(I);
+ if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+ StartIndex = I;
+ break;
+ }
+ }
+
+ SmallVector<VPValue *, 4> OperandsWithMask;
+ OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
+
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ if (I == StartIndex)
+ continue;
+ OperandsWithMask.push_back(Blend->getIncomingValue(I));
+ OperandsWithMask.push_back(Blend->getMask(I));
+ }
+
+ auto *NewBlend = new VPBlendRecipe(
+ cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+ NewBlend->insertBefore(&R);
+
+ VPValue *DeadMask = Blend->getMask(StartIndex);
+ Blend->replaceAllUsesWith(NewBlend);
+ Blend->eraseFromParent();
+ recursivelyDeleteDeadRecipes(DeadMask);
+ return;
+ }
+
+ VPValue *A;
+ if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
+ VPValue *Trunc = R.getVPSingleValue();
+ Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+ Type *ATy = TypeInfo.inferScalarType(A);
+ if (TruncTy == ATy) {
+ Trunc->replaceAllUsesWith(A);
+ } else {
+ // Don't replace a scalarizing recipe with a widened cast.
+ if (isa<VPReplicateRecipe>(&R))
+ return;
+ if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+ unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+ ? Instruction::SExt
+ : Instruction::ZExt;
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+ // UnderlyingExt has distinct return type, used to retain legacy cost.
+ VPC->setUnderlyingValue(UnderlyingExt);
+ }
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
+ }
+#ifndef NDEBUG
+ // Verify that the cached type info for both A and its users is still
+ // accurate by comparing it to freshly computed types.
+ VPTypeAnalysis TypeInfo2(
+ R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
+ assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+ for (VPUser *U : A->users()) {
+ auto *R = cast<VPRecipeBase>(U);
+ for (VPValue *VPV : R->definedValues())
+ assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+ }
+#endif
+ }
+
+ // Simplify (X && Y) || (X && !Y) -> X.
+ // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+ // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+ // recipes to be visited during simplification.
+ VPValue *X, *Y, *X1, *Y1;
+ if (match(&R,
+ m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+ X == X1 && Y == Y1) {
+ R.getVPSingleValue()->replaceAllUsesWith(X);
+ R.eraseFromParent();
+ return;
+ }
+
+ if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+ if (match(&R, m_Not(m_Not(m_VPValue(A)))))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+ // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
+ if ((match(&R,
+ m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
+ match(&R,
+ m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
+ TypeInfo.inferScalarType(R.getOperand(1)) ==
+ TypeInfo.inferScalarType(R.getVPSingleValue()))
+ return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan) {
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ VPTypeAnalysis TypeInfo(CanonicalIVType);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ simplifyRecipe(R, TypeInfo);
+ }
+ }
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -696,11 +842,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPInstruction::BranchOnCond,
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
- SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
- for (VPValue *Op : PossiblyDead)
- recursivelyDeleteDeadRecipes(Op);
ExitingVPBB->appendRecipe(BOC);
+
+ VPlanTransforms::removeDeadRecipes(Plan);
+
Plan.setVF(BestVF);
Plan.setUF(BestUF);
// TODO: Further simplifications are possible
@@ -941,126 +1087,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
-/// Try to simplify recipe \p R.
-static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
- using namespace llvm::VPlanPatternMatch;
-
- if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
- // Try to remove redundant blend recipes.
- SmallPtrSet<VPValue *, 4> UniqueValues;
- if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
- UniqueValues.insert(Blend->getIncomingValue(0));
- for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
- if (!match(Blend->getMask(I), m_False()))
- UniqueValues.insert(Blend->getIncomingValue(I));
-
- if (UniqueValues.size() == 1) {
- Blend->replaceAllUsesWith(*UniqueValues.begin());
- Blend->eraseFromParent();
- return;
- }
-
- if (Blend->isNormalized())
- return;
-
- // Normalize the blend so its first incoming value is used as the initial
- // value with the others blended into it.
-
- unsigned StartIndex = 0;
- for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
- // If a value's mask is used only by the blend then is can be deadcoded.
- // TODO: Find the most expensive mask that can be deadcoded, or a mask
- // that's used by multiple blends where it can be removed from them all.
- VPValue *Mask = Blend->getMask(I);
- if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
- StartIndex = I;
- break;
- }
- }
-
- SmallVector<VPValue *, 4> OperandsWithMask;
- OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
- for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
- if (I == StartIndex)
- continue;
- OperandsWithMask.push_back(Blend->getIncomingValue(I));
- OperandsWithMask.push_back(Blend->getMask(I));
- }
-
- auto *NewBlend = new VPBlendRecipe(
- cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
- NewBlend->insertBefore(&R);
-
- VPValue *DeadMask = Blend->getMask(StartIndex);
- Blend->replaceAllUsesWith(NewBlend);
- Blend->eraseFromParent();
- recursivelyDeleteDeadRecipes(DeadMask);
- return;
- }
-
- VPValue *A;
- if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
- VPValue *Trunc = R.getVPSingleValue();
- Type *TruncTy = TypeInfo.inferScalarType(Trunc);
- Type *ATy = TypeInfo.inferScalarType(A);
- if (TruncTy == ATy) {
- Trunc->replaceAllUsesWith(A);
- } else {
- // Don't replace a scalarizing recipe with a widened cast.
- if (isa<VPReplicateRecipe>(&R))
- return;
- if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
-
- unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
- ? Instruction::SExt
- : Instruction::ZExt;
- auto *VPC =
- new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
- if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
- // UnderlyingExt has distinct return type, used to retain legacy cost.
- VPC->setUnderlyingValue(UnderlyingExt);
- }
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
- auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- }
- }
-#ifndef NDEBUG
- // Verify that the cached type info is for both A and its users is still
- // accurate by comparing it to freshly computed types.
- VPTypeAnalysis TypeInfo2(
- R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
- assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
- for (VPUser *U : A->users()) {
- auto *R = cast<VPRecipeBase>(U);
- for (VPValue *VPV : R->definedValues())
- assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
- }
-#endif
- }
-
- // Simplify (X && Y) || (X && !Y) -> X.
- // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
- // && (Y || Z) and (X || !X) into true. This requires queuing newly created
- // recipes to be visited during simplification.
- VPValue *X, *Y, *X1, *Y1;
- if (match(&R,
- m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
- m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
- X == X1 && Y == Y1) {
- R.getVPSingleValue()->replaceAllUsesWith(X);
- R.eraseFromParent();
- return;
- }
-
- if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
- return R.getVPSingleValue()->replaceAllUsesWith(A);
-}
-
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1095,19 +1121,6 @@ static void licm(VPlan &Plan) {
}
}
-/// Try to simplify the recipes in \p Plan.
-static void simplifyRecipes(VPlan &Plan) {
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getEntry());
- Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
- VPTypeAnalysis TypeInfo(CanonicalIVType);
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- simplifyRecipe(R, TypeInfo);
- }
- }
-}
-
void VPlanTransforms::truncateToMinimalBitwidths(
VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
#ifndef NDEBUG
@@ -1247,11 +1260,11 @@ void VPlanTransforms::optimize(VPlan &Plan) {
simplifyRecipes(Plan);
legalizeAndOptimizeInductions(Plan);
+ removeRedundantExpandSCEVRecipes(Plan);
+ simplifyRecipes(Plan);
removeDeadRecipes(Plan);
createAndOptimizeReplicateRegions(Plan);
-
- removeRedundantExpandSCEVRecipes(Plan);
mergeBlocksIntoPredecessors(Plan);
licm(Plan);
}
@@ -1438,112 +1451,134 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
+/// nullptr if no EVL-based recipe could be created.
+/// \p HeaderMask Header Mask.
+/// \p CurRecipe Recipe to be transformed.
+/// \p TypeInfo VPlan-based type analysis.
+/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
+/// \p EVL The explicit vector length parameter of vector-predication
+/// intrinsics.
+static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
+ VPRecipeBase &CurRecipe,
+ VPTypeAnalysis &TypeInfo,
+ VPValue &AllOneMask, VPValue &EVL) {
+ using namespace llvm::VPlanPatternMatch;
+ auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
+ assert(OrigMask && "Unmasked recipe when folding tail");
+ return HeaderMask == OrigMask ? nullptr : OrigMask;
+ };
+
+ return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+ })
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ VPValue *NewMask = GetNewMask(S->getMask());
+ return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+ })
+ .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
+ unsigned Opcode = W->getOpcode();
+ if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
+ return nullptr;
+ return new VPWidenEVLRecipe(*W, EVL);
+ })
+ .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
+ VPValue *NewMask = GetNewMask(Red->getCondOp());
+ return new VPReductionEVLRecipe(*Red, EVL, NewMask);
+ })
+ .Case<VPWidenIntrinsicRecipe, VPWidenCastRecipe>(
+ [&](auto *CR) -> VPRecipeBase * {
+ Intrinsic::ID VPID;
+ if (auto *CallR = dyn_cast<VPWidenIntrinsicRecipe>(CR)) {
+ VPID =
+ VPIntrinsic::getForIntrinsic(CallR->getVectorIntrinsicID());
+ } else {
+ auto *CastR = cast<VPWidenCastRecipe>(CR);
+ VPID = VPIntrinsic::getForOpcode(CastR->getOpcode());
+ }
+ assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic");
+ assert(VPIntrinsic::getMaskParamPos(VPID) &&
+ VPIntrinsic::getVectorLengthParamPos(VPID) &&
+ "Expected VP intrinsic");
+
+ SmallVector<VPValue *> Ops(CR->operands());
+ Ops.push_back(&AllOneMask);
+ Ops.push_back(&EVL);
+ return new VPWidenIntrinsicRecipe(
+ VPID, Ops, TypeInfo.inferScalarType(CR), CR->getDebugLoc());
+ })
+ .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+ SmallVector<VPValue *> Ops(Sel->operands());
+ Ops.push_back(&EVL);
+ return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
+ TypeInfo.inferScalarType(Sel),
+ Sel->getDebugLoc());
+ })
+ .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
+ VPValue *LHS, *RHS;
+ // Transform select with a header mask condition
+ // select(header_mask, LHS, RHS)
+ // into vector predication merge.
+ // vp.merge(all-true, LHS, RHS, EVL)
+ if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
+ m_VPValue(RHS))))
+ return nullptr;
+ // Use all true as the condition because this transformation is
+ // limited to selects whose condition is a header mask.
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
+ TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
+ })
+ .Default([&](VPRecipeBase *R) { return nullptr; });
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
- using namespace llvm::VPlanPatternMatch;
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
VPTypeAnalysis TypeInfo(CanonicalIVType);
LLVMContext &Ctx = CanonicalIVType->getContext();
- SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+ VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
for (VPUser *U : Plan.getVF().users()) {
if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U))
R->setOperand(1, &EVL);
}
+ SmallVector<VPRecipeBase *> ToErase;
+
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
- auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
- assert(OrigMask && "Unmasked recipe when folding tail");
- return HeaderMask == OrigMask ? nullptr : OrigMask;
- };
-
- VPRecipeBase *NewRecipe =
- TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe)
- .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
- VPValue *NewMask = GetNewMask(L->getMask());
- return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
- })
- .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
- VPValue *NewMask = GetNewMask(S->getMask());
- return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
- })
- .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
- unsigned Opcode = W->getOpcode();
- if (!Instruction::isBinaryOp(Opcode) &&
- !Instruction::isUnaryOp(Opcode))
- return nullptr;
- return new VPWidenEVLRecipe(*W, EVL);
- })
- .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
- VPValue *NewMask = GetNewMask(Red->getCondOp());
- return new VPReductionEVLRecipe(*Red, EVL, NewMask);
- })
- .Case<VPWidenIntrinsicRecipe>(
- [&](VPWidenIntrinsicRecipe *CInst) -> VPRecipeBase * {
- auto *CI = cast<CallInst>(CInst->getUnderlyingInstr());
- Intrinsic::ID VPID = VPIntrinsic::getForIntrinsic(
- CI->getCalledFunction()->getIntrinsicID());
- if (VPID == Intrinsic::not_intrinsic)
- return nullptr;
-
- SmallVector<VPValue *> Ops(CInst->operands());
- assert(VPIntrinsic::getMaskParamPos(VPID) &&
- VPIntrinsic::getVectorLengthParamPos(VPID) &&
- "Expected VP intrinsic");
- VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
- IntegerType::getInt1Ty(CI->getContext())));
- Ops.push_back(Mask);
- Ops.push_back(&EVL);
- return new VPWidenIntrinsicRecipe(
- *CI, VPID, Ops, TypeInfo.inferScalarType(CInst),
- CInst->getDebugLoc());
- })
- .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
- SmallVector<VPValue *> Ops(Sel->operands());
- Ops.push_back(&EVL);
- return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
- TypeInfo.inferScalarType(Sel),
- Sel->getDebugLoc());
- })
- .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
- VPValue *LHS, *RHS;
- // Transform select with a header mask condition
- // select(header_mask, LHS, RHS)
- // into vector predication merge.
- // vp.merge(all-true, LHS, RHS, EVL)
- if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
- m_VPValue(RHS))))
- return nullptr;
- // Use all true as the condition because this transformation is
- // limited to selects whose condition is a header mask.
- VPValue *AllTrue =
- Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- return new VPWidenIntrinsicRecipe(
- Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
- })
- .Default([&](VPRecipeBase *R) { return nullptr; });
-
- if (!NewRecipe)
+ VPRecipeBase *EVLRecipe =
+ createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
+ [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
assert(
NumDefVal <= 1 &&
"Only supports recipes with a single definition or without users.");
- NewRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
+ EVLRecipe->insertBefore(CurRecipe);
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
- CurRecipe->eraseFromParent();
+ // Defer erasing recipes till the end so that we don't invalidate the
+ // VPTypeAnalysis cache.
+ ToErase.push_back(CurRecipe);
}
- recursivelyDeleteDeadRecipes(HeaderMask);
+ }
+
+ for (VPRecipeBase *R : reverse(ToErase)) {
+ SmallVector<VPValue *> PossiblyDead(R->operands());
+ R->eraseFromParent();
+ for (VPValue *Op : PossiblyDead)
+ recursivelyDeleteDeadRecipes(Op);
}
}
@@ -1667,8 +1702,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// instruction. Widen memory instructions involved in address computation
// will lead to gather/scatter instructions, which don't need to be
// handled.
- if (isa<VPWidenMemoryRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
+ if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
+ VPHeaderPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1820,9 +1855,7 @@ void VPlanTransforms::createInterleaveGroups(
}
}
-void VPlanTransforms::prepareToExecute(VPlan &Plan) {
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getVectorLoopRegion());
+void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
@@ -1840,3 +1873,62 @@ void VPlanTransforms::prepareToExecute(VPlan &Plan) {
}
}
}
+
+void VPlanTransforms::handleUncountableEarlyExit(
+ VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+ BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *LatchVPBB = cast<VPBasicBlock>(LoopRegion->getExiting());
+ VPBuilder Builder(LatchVPBB->getTerminator());
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPValue *IsEarlyExitTaken = nullptr;
+
+ // Process the uncountable exiting block. Update IsEarlyExitTaken, which
+ // tracks if the uncountable early exit has been taken. Also split the middle
+ // block and have it conditionally branch to the early exit block if
+ // EarlyExitTaken.
+ auto *EarlyExitingBranch =
+ cast<BranchInst>(UncountableExitingBlock->getTerminator());
+ BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
+ BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
+
+ // The early exit block may or may not be the same as the "countable" exit
+ // block. Creates a new VPIRBB for the early exit block in case it is distinct
+ // from the countable exit block.
+ // TODO: Introduce both exit blocks during VPlan skeleton construction.
+ VPIRBasicBlock *VPEarlyExitBlock;
+ if (OrigLoop->getUniqueExitBlock()) {
+ VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ } else {
+ VPEarlyExitBlock = Plan.createVPIRBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ }
+
+ VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
+ OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
+ IsEarlyExitTaken =
+ Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
+
+ VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
+ VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
+ VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
+ NewMiddle->swapSuccessors();
+
+ VPBuilder MiddleBuilder(NewMiddle);
+ MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
+
+ // Replace the condition controlling the non-early exit from the vector loop
+ // with one exiting if either the original condition of the vector latch is
+ // true or the early exit has been taken.
+ auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
+ assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
+ "Unexpected terminator");
+ auto *IsLatchExitTaken =
+ Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+ LatchExitingBranch->getOperand(1));
+ auto *AnyExitTaken = Builder.createNaryOp(
+ Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+ LatchExitingBranch->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 1491e0a8df04..fddde8689116 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,8 +124,19 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+ /// Update \p Plan to account for the uncountable early exit block in \p
+ /// UncountableExitingBlock by
+ /// * updating the condition exiting the vector loop to include the early
+ /// exit conditions
+ /// * splitting the original middle block to branch to the early exit block
+ /// if taken.
+ static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ BasicBlock *UncountableExitingBlock,
+ VPRecipeBuilder &RecipeBuilder);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd.
- static void prepareToExecute(VPlan &Plan);
+ static void convertToConcreteRecipes(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f653269713b3..89e372d6b46c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -168,7 +168,7 @@ void UnrollState::unrollWidenInductionByUF(
auto *ConstStep = ScalarStep->isLiveIn()
? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
: nullptr;
- if (!ConstStep || ConstStep->getZExtValue() != 1) {
+ if (!ConstStep || ConstStep->getValue() != 1) {
if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
ScalarStep =
Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
@@ -412,8 +412,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
UnrollState Unroller(Plan, UF, Ctx);
- Unroller.unrollBlock(Plan.getPreheader());
-
// Iterate over all blocks in the plan starting from Entry, and unroll
// recipes inside them. This includes the vector preheader and middle blocks,
// which may set up or post-process per-part values.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4621c28b0512..e40af3e2e3d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -34,7 +34,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
Expanded = Plan.getOrAddLiveIn(E->getValue());
else {
Expanded = new VPExpandSCEVRecipe(Expr, SE);
- Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
+ Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
}
Plan.addSCEVExpansion(Expr, Expanded);
return Expanded;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 71c7d547ac7d..be420a873bef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -185,7 +185,7 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
RecipeNumbering[&R] = Cnt++;
for (const VPRecipeBase &R : *VPBB) {
- if (isa<VPIRInstruction>(&R) ^ isa<VPIRBasicBlock>(VPBB)) {
+ if (isa<VPIRInstruction>(&R) && !isa<VPIRBasicBlock>(VPBB)) {
errs() << "VPIRInstructions ";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
R.dump();
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9caf8c0df9b..493ed95b1d22 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -67,9 +67,10 @@ class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
- const DataLayout *DL, bool TryEarlyFoldsOnly)
+ const DataLayout *DL, TTI::TargetCostKind CostKind,
+ bool TryEarlyFoldsOnly)
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
- TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
+ CostKind(CostKind), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -81,6 +82,7 @@ private:
AAResults &AA;
AssumptionCache &AC;
const DataLayout *DL;
+ TTI::TargetCostKind CostKind;
/// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
@@ -113,6 +115,7 @@ private:
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
+ bool foldConcatOfBoolMasks(Instruction &I);
bool foldPermuteOfBinops(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
@@ -125,6 +128,8 @@ private:
bool shrinkType(Instruction &I);
void replaceValue(Value &Old, Value &New) {
+ LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
+ LLVM_DEBUG(dbgs() << " With: " << New << '\n');
Old.replaceAllUsesWith(&New);
if (auto *NewI = dyn_cast<Instruction>(&New)) {
New.takeName(&Old);
@@ -135,10 +140,18 @@ private:
}
void eraseInstruction(Instruction &I) {
- for (Value *Op : I.operands())
- Worklist.pushValue(Op);
+ LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
+ SmallVector<Value *> Ops(I.operands());
Worklist.remove(&I);
I.eraseFromParent();
+
+ // Push remaining users of the operands and then the operand itself - allows
+ // further folds that were hindered by OneUse limits.
+ for (Value *Op : Ops)
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ Worklist.pushUsersToWorkList(*OpI);
+ Worklist.pushValue(OpI);
+ }
}
};
} // namespace
@@ -176,8 +189,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
Value *Scalar;
- if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
- !Scalar->hasOneUse())
+ if (!match(&I,
+ m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt())))
return false;
// Optionally match an extract from another vector.
@@ -247,16 +260,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
InstructionCost OldCost =
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OldCost +=
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
// Optionally, we are shuffling the loaded vector element(s) into place.
// For the mask set everything but element 0 to undef to prevent poison from
// propagating from the extra loaded memory. This will also optionally
@@ -270,7 +282,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
Mask[0] = OffsetEltIndex;
if (OffsetEltIndex)
- NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
+ NewCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
@@ -329,11 +342,11 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) {
// undef value is 0. We could add that cost if the cost model accurately
// reflects the real cost of that operation.
InstructionCost OldCost =
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
// New pattern: load PtrOp
InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+ TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
@@ -366,7 +379,6 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
return nullptr;
Type *VecTy = Ext0->getVectorOperand()->getType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
@@ -420,23 +432,22 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// Get cost estimates for scalar and vector versions of the operation.
bool IsBinOp = Instruction::isBinaryOp(Opcode);
if (IsBinOp) {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
} else {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Expected a compare");
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
- Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
- Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
}
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
unsigned Ext0Index = Ext0IndexC->getZExtValue();
unsigned Ext1Index = Ext1IndexC->getZExtValue();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
@@ -596,7 +607,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
return false;
Instruction *I0, *I1;
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
!match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
return false;
@@ -665,9 +676,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
return false;
- // TODO: We could handle this with a length-changing shuffle.
auto *VecTy = cast<FixedVectorType>(I.getType());
- if (SrcVec->getType() != VecTy)
+ auto *ScalarTy = VecTy->getScalarType();
+ auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
+ if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
return false;
// Ignore bogus insert/extract index.
@@ -681,11 +693,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
SmallVector<int> Mask(NumElts);
std::iota(Mask.begin(), Mask.end(), 0);
Mask[Index] = Index + NumElts;
-
- Type *ScalarTy = VecTy->getScalarType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+ TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
// If the extract has one use, it will be eliminated, so count it in the
@@ -695,17 +704,36 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+ TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
+
+ bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
+ // If the lengths of the two vectors are not equal,
+ // we need an extra length-changing shuffle. Add its cost.
+ SmallVector<int> SrcMask;
+ if (NeedLenChg) {
+ SrcMask.assign(NumElts, PoisonMaskElem);
+ SrcMask[Index] = Index;
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ SrcVecTy, SrcMask, CostKind);
+ }
if (NewCost > OldCost)
return false;
- // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
- // shuffle DestVec, (fneg SrcVec), Mask
+ Value *NewShuf;
+ // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
- Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
- replaceValue(I, *Shuf);
+ if (NeedLenChg) {
+ // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
+ Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
+ NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
+ } else {
+ // shuffle DestVec, (fneg SrcVec), Mask
+ NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ }
+
+ replaceValue(I, *NewShuf);
return true;
}
@@ -772,22 +800,25 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
unsigned NumOps = IsUnary ? 1 : 2;
// The new shuffle must not cost more than the old shuffle.
- TargetTransformInfo::TargetCostKind CK =
- TargetTransformInfo::TCK_RecipThroughput;
TargetTransformInfo::ShuffleKind SK =
IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
: TargetTransformInfo::SK_PermuteTwoSrc;
- InstructionCost DestCost =
- TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
+ InstructionCost NewCost =
+ TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) +
(NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
TargetTransformInfo::CastContextHint::None,
- CK));
- InstructionCost SrcCost =
- TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
+ CostKind));
+ InstructionCost OldCost =
+ TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) +
TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
- TargetTransformInfo::CastContextHint::None, CK);
- if (DestCost > SrcCost || !DestCost.isValid())
+ TargetTransformInfo::CastContextHint::None,
+ CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
+ << OldCost << " vs NewCost: " << NewCost << "\n");
+
+ if (NewCost > OldCost || !NewCost.isValid())
return false;
// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
@@ -841,13 +872,13 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
// Calculate cost of splatting both operands into vectors and the vector
// intrinsic
VectorType *VecTy = cast<VectorType>(VPI.getType());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
Mask.resize(FVTy->getNumElements(), 0);
InstructionCost SplatCost =
TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask,
+ CostKind);
// Calculate the cost of the VP Intrinsic
SmallVector<Type *, 4> Args;
@@ -873,8 +904,8 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
} else {
- ScalarOpCost =
- TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
+ ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
+ VecTy->getScalarType(), CostKind);
}
// The existing splats may be kept around if other instructions use them.
@@ -924,7 +955,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
!match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
@@ -993,17 +1024,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (IsCmp) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
- Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
- Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
} else {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
}
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost InsertCost = TTI.getVectorInstrCost(
Instruction::InsertElement, VecTy, CostKind, Index);
InstructionCost OldCost =
@@ -1065,9 +1095,11 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
Instruction *I0, *I1;
Constant *C0, *C1;
- CmpInst::Predicate P0, P1;
+ CmpPredicate P0, P1;
+ // FIXME: Use CmpPredicate::getMatching here.
if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
- !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1)
+ !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) ||
+ P0 != static_cast<CmpInst::Predicate>(P1))
return false;
// The compare operands must be extracts of the same vector with constant
@@ -1080,7 +1112,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
auto *Ext0 = cast<ExtractElementInst>(I0);
auto *Ext1 = cast<ExtractElementInst>(I1);
- ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
if (!ConvertToShuf)
return false;
assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
@@ -1089,23 +1121,23 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
// The original scalar pattern is:
// binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
CmpInst::Predicate Pred = P0;
- unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
- : Instruction::ICmp;
+ unsigned CmpOpcode =
+ CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
if (!VecTy)
return false;
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Ext0Cost =
- TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0),
- Ext1Cost =
- TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
+ TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+ InstructionCost Ext1Cost =
+ TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
+ InstructionCost CmpCost = TTI.getCmpSelInstrCost(
+ CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
+ CostKind);
+
InstructionCost OldCost =
- Ext0Cost + Ext1Cost +
- TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
- CmpInst::makeCmpResultType(I0->getType()), Pred) *
- 2 +
- TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+ Ext0Cost + Ext1Cost + CmpCost * 2 +
+ TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
// The proposed vector pattern is:
// vcmp = cmp Pred X, VecC
@@ -1114,12 +1146,13 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
InstructionCost NewCost = TTI.getCmpSelInstrCost(
- CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
+ CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred,
+ CostKind);
SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
ShufMask[CheapIndex] = ExpensiveIndex;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
- ShufMask);
- NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ ShufMask, CostKind);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
@@ -1311,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
MemoryLocation::get(SI), AA))
return false;
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // be erased in the correct order.
+ Worklist.push(Load);
+
if (ScalarizableIdx.isSafeWithFreeze())
ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
Value *GEP = Builder.CreateInBoundsGEP(
@@ -1336,14 +1373,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
if (!match(&I, m_Load(m_Value(Ptr))))
return false;
- auto *VecTy = cast<VectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
+ auto *VecTy = cast<VectorType>(LI->getType());
if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
InstructionCost OriginalCost =
TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
- LI->getPointerAddressSpace());
+ LI->getPointerAddressSpace(), CostKind);
InstructionCost ScalarizedCost = 0;
Instruction *LastCheckedInst = LI;
@@ -1377,7 +1414,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
LastCheckedInst = UI;
}
- auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
+ auto ScalarIdx =
+ canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
if (ScalarIdx.isUnsafe())
return false;
if (ScalarIdx.isSafeWithFreeze()) {
@@ -1385,24 +1423,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
ScalarIdx.discard();
}
- auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
OriginalCost +=
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
- Align(1), LI->getPointerAddressSpace());
+ Align(1), LI->getPointerAddressSpace(), CostKind);
ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
}
if (ScalarizedCost >= OriginalCost)
return false;
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // be erased in the correct order.
+ Worklist.push(LI);
+
// Replace extracts with narrow scalar loads.
for (User *U : LI->users()) {
auto *EI = cast<ExtractElementInst>(U);
- Value *Idx = EI->getOperand(1);
+ Value *Idx = EI->getIndexOperand();
// Insert 'freeze' for poison indexes.
auto It = NeedFreeze.find(EI);
@@ -1426,6 +1467,117 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
+/// to "(bitcast (concat X, Y))" plus any residual zext/shl, where the or is
+/// disjoint, X/Y are same-typed vXi1 mask vectors and C is the mask width.
+bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
+ Type *Ty = I.getType();
+ if (!Ty->isIntegerTy())
+ return false;
+
+ // TODO: Add big endian test coverage; only little endian is handled for now.
+ if (DL->isBigEndian())
+ return false;
+
+ // Restrict to disjoint cases so the mask vectors aren't overlapping.
+ Instruction *X, *Y;
+ if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
+ return false;
+
+ // Allow both sources to contain shl, to handle the more generic pattern:
+ // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
+ Value *SrcX;
+ uint64_t ShAmtX = 0;
+ if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
+ !match(X, m_OneUse(
+ m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
+ m_ConstantInt(ShAmtX)))))
+ return false;
+
+ Value *SrcY;
+ uint64_t ShAmtY = 0;
+ if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
+ !match(Y, m_OneUse(
+ m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
+ m_ConstantInt(ShAmtY)))))
+ return false;
+
+ // Canonicalize the larger shift amount to the RHS (Y), so ShAmtY >= ShAmtX.
+ if (ShAmtX > ShAmtY) {
+ std::swap(X, Y);
+ std::swap(SrcX, SrcY);
+ std::swap(ShAmtX, ShAmtY);
+ }
+
+ // Ensure both sources are matching vXi1 bool mask types, and that the shift
+ // difference is the mask width so they can be easily concatenated together.
+ uint64_t ShAmtDiff = ShAmtY - ShAmtX;
+ unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
+ unsigned BitWidth = Ty->getPrimitiveSizeInBits();
+ auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
+ if (!MaskTy || SrcX->getType() != SrcY->getType() ||
+ !MaskTy->getElementType()->isIntegerTy(1) ||
+ MaskTy->getNumElements() != ShAmtDiff ||
+ MaskTy->getNumElements() > (BitWidth / 2)) // Concat (2x elts) must fit Ty.
+ return false;
+
+ auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
+ auto *ConcatIntTy =
+ Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
+ auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
+
+ SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+
+ // TODO: Is it worth supporting multi use cases? (Matches above are one-use.)
+ InstructionCost OldCost = 0;
+ OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
+ OldCost +=
+ NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
+ OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
+ TTI::CastContextHint::None, CostKind);
+
+ InstructionCost NewCost = 0;
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy,
+ ConcatMask, CostKind);
+ NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
+ TTI::CastContextHint::None, CostKind);
+ if (Ty != ConcatIntTy)
+ NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
+ TTI::CastContextHint::None, CostKind);
+ if (ShAmtX > 0)
+ NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+
+ if (NewCost > OldCost)
+ return false;
+
+ // Build bool mask concatenation, bitcast back to scalar integer, and perform
+ // any residual zero-extension or shifting.
+ Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
+ Worklist.pushValue(Concat);
+
+ Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
+
+ if (Ty != ConcatIntTy) {
+ Worklist.pushValue(Result);
+ Result = Builder.CreateZExt(Result, Ty);
+ }
+
+ if (ShAmtX > 0) {
+ Worklist.pushValue(Result);
+ Result = Builder.CreateShl(Result, ShAmtX);
+ }
+
+ replaceValue(I, *Result);
+ return true;
+}
+
/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
/// --> "binop (shuffle), (shuffle)".
bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
@@ -1480,8 +1632,6 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
// Try to merge shuffles across the binop if the new shuffles are not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
@@ -1523,34 +1673,46 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
+/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
- BinaryOperator *B0, *B1;
ArrayRef<int> OldMask;
- if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
- m_Mask(OldMask))))
- return false;
-
- // Don't introduce poison into div/rem.
- if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
+ Instruction *LHS, *RHS;
+ if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
+ m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
return false;
// TODO: Add support for addlike etc.
- Instruction::BinaryOps Opcode = B0->getOpcode();
- if (Opcode != B1->getOpcode())
+ if (LHS->getOpcode() != RHS->getOpcode())
+ return false;
+
+ Value *X, *Y, *Z, *W;
+ bool IsCommutative = false;
+ CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
+ CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
+ match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
+ auto *BO = cast<BinaryOperator>(LHS);
+ // Don't introduce poison into div/rem.
+ if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
+ return false;
+ IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
+ } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
+ match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
+ (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
+ IsCommutative = cast<CmpInst>(LHS)->isCommutative();
+ } else
return false;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
- if (!ShuffleDstTy || !BinOpTy)
+ auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
+ auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
return false;
unsigned NumSrcElts = BinOpTy->getNumElements();
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
- Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
- Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
- if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
- (X == W || Y == Z))
+ if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
std::swap(X, Y);
auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1575,33 +1737,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
}
// Try to replace a binop with a shuffle if the shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) +
- TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
- OldMask, CostKind, 0, nullptr, {B0, B1}, &I);
+ TTI.getInstructionCost(LHS, CostKind) +
+ TTI.getInstructionCost(RHS, CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
+ OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
InstructionCost NewCost =
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
- TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
- TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+
+ if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
+ NewCost +=
+ TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
+ } else {
+ auto *ShuffleCmpTy =
+ FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
+ NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
+ ShuffleDstTy, PredLHS, CostKind);
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
- if (NewCost >= OldCost)
+
+ // If either shuffle will constant fold away, then fold for the same cost as
+ // we will reduce the instruction count.
+ bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
+ (isa<Constant>(Y) && isa<Constant>(W));
+ if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
return false;
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
- Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
+ ? Builder.CreateBinOp(
+ cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
+ : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
// Intersect flags from the old binops.
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
- NewInst->copyIRFlags(B0);
- NewInst->andIRFlags(B1);
+ NewInst->copyIRFlags(LHS);
+ NewInst->andIRFlags(RHS);
}
Worklist.pushValue(Shuf0);
@@ -1672,8 +1849,6 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
// Try to replace a castop with a shuffle if the shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
InstructionCost CostC0 =
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
TTI::CastContextHint::None, CostKind);
@@ -1715,77 +1890,123 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
return true;
}
-/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
+/// Try to convert any of:
+/// "shuffle (shuffle x, y), (shuffle y, x)"
+/// "shuffle (shuffle x, undef), (shuffle y, undef)"
+/// "shuffle (shuffle x, undef), y"
+/// "shuffle x, (shuffle y, undef)"
/// into "shuffle x, y".
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
- Value *V0, *V1;
- UndefValue *U0, *U1;
- ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
+ ArrayRef<int> OuterMask;
+ Value *OuterV0, *OuterV1;
if (!match(&I,
- m_Shuffle(
- m_Shuffle(m_Value(V0), m_UndefValue(U0), m_Mask(InnerMask0)),
- m_Shuffle(m_Value(V1), m_UndefValue(U1), m_Mask(InnerMask1)),
- m_Mask(OuterMask))))
+ m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
+ return false;
+
+ ArrayRef<int> InnerMask0, InnerMask1;
+ Value *X0, *X1, *Y0, *Y1;
+ bool Match0 =
+ match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
+ bool Match1 =
+ match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
+ if (!Match0 && !Match1)
return false;
- auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
- auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
+ X0 = Match0 ? X0 : OuterV0;
+ Y0 = Match0 ? Y0 : OuterV0;
+ X1 = Match1 ? X1 : OuterV1;
+ Y1 = Match1 ? Y1 : OuterV1;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
- auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
+ auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
+ auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
- V0->getType() != V1->getType())
+ X0->getType() != X1->getType())
return false;
unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
unsigned NumImmElts = ShuffleImmTy->getNumElements();
- // Bail if either inner masks reference a RHS undef arg.
- if ((!isa<PoisonValue>(U0) &&
- any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) ||
- (!isa<PoisonValue>(U1) &&
- any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; })))
- return false;
-
- // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
+ // Attempt to merge shuffles, matching up to 2 source operands.
+ // Replace index to a poison arg with PoisonMaskElem.
+ // Bail if either inner mask references an undef arg.
SmallVector<int, 16> NewMask(OuterMask);
+ Value *NewX = nullptr, *NewY = nullptr;
for (int &M : NewMask) {
+ Value *Src = nullptr;
if (0 <= M && M < (int)NumImmElts) {
- M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M];
+ Src = OuterV0;
+ if (Match0) {
+ M = InnerMask0[M];
+ Src = M >= (int)NumSrcElts ? Y0 : X0;
+ M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
+ }
} else if (M >= (int)NumImmElts) {
- if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts)
+ Src = OuterV1;
+ M -= NumImmElts;
+ if (Match1) {
+ M = InnerMask1[M];
+ Src = M >= (int)NumSrcElts ? Y1 : X1;
+ M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
+ }
+ }
+ if (Src && M != PoisonMaskElem) {
+ assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
+ if (isa<UndefValue>(Src)) {
+ // We've referenced an undef element - if it's poison, update the shuffle
+ // mask, else bail.
+ if (!isa<PoisonValue>(Src))
+ return false;
M = PoisonMaskElem;
- else
- M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
+ continue;
+ }
+ if (!NewX || NewX == Src) {
+ NewX = Src;
+ continue;
+ }
+ if (!NewY || NewY == Src) {
+ M += NumSrcElts;
+ NewY = Src;
+ continue;
+ }
+ return false;
}
}
+ if (!NewX)
+ return PoisonValue::get(ShuffleDstTy);
+ if (!NewY)
+ NewY = PoisonValue::get(ShuffleSrcTy);
+
// Have we folded to an Identity shuffle?
if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
- replaceValue(I, *V0);
+ replaceValue(I, *NewX);
return true;
}
// Try to merge the shuffles if the new shuffle is not costly.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- InstructionCost InnerCost0 =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0);
- InstructionCost InnerCost1 =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1);
- InstructionCost OuterCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
- OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I);
+ InstructionCost InnerCost0 = 0;
+ if (Match0)
+ InnerCost0 = TTI.getInstructionCost(cast<Instruction>(OuterV0), CostKind);
+
+ InstructionCost InnerCost1 = 0;
+ if (Match1)
+ InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind);
+
+ InstructionCost OuterCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind,
+ 0, nullptr, {OuterV0, OuterV1}, &I);
+
InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy,
- NewMask, CostKind, 0, nullptr, {V0, V1});
- if (!ShufI0->hasOneUse())
+ bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
+ TargetTransformInfo::ShuffleKind SK =
+ IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
+ : TargetTransformInfo::SK_PermuteTwoSrc;
+ InstructionCost NewCost = TTI.getShuffleCost(
+ SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY});
+ if (!OuterV0->hasOneUse())
NewCost += InnerCost0;
- if (!ShufI1->hasOneUse())
+ if (!OuterV1->hasOneUse())
NewCost += InnerCost1;
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
@@ -1794,13 +2015,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
if (NewCost > OldCost)
return false;
- // Clear unused sources to poison.
- if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; }))
- V0 = PoisonValue::get(ShuffleSrcTy);
- if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; }))
- V1 = PoisonValue::get(ShuffleSrcTy);
-
- Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask);
+ Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
replaceValue(I, *Shuf);
return true;
}
@@ -1832,32 +2047,30 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
return false;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I) &&
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
II0->getArgOperand(I) != II1->getArgOperand(I))
return false;
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
- TTI::TCK_RecipThroughput) +
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
- TTI::TCK_RecipThroughput) +
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
- TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
+ CostKind, 0, nullptr, {II0, II1}, &I);
SmallVector<Type *> NewArgsTy;
InstructionCost NewCost = 0;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
NewArgsTy.push_back(II0->getArgOperand(I)->getType());
} else {
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
VecTy->getNumElements() * 2));
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- VecTy, OldMask, TTI::TCK_RecipThroughput);
+ VecTy, OldMask, CostKind);
}
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
- NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
+ NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1868,7 +2081,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
SmallVector<Value *> NewArgs;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
- if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
NewArgs.push_back(II0->getArgOperand(I));
} else {
Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
@@ -1923,7 +2136,7 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
}
/// Detect concat of multiple values into a vector
-static bool isFreeConcat(ArrayRef<InstLane> Item,
+static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
const TargetTransformInfo &TTI) {
auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
unsigned NumElts = Ty->getNumElements();
@@ -1934,8 +2147,7 @@ static bool isFreeConcat(ArrayRef<InstLane> Item,
// during legalization.
SmallVector<int, 16> ConcatMask(NumElts * 2);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
- if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
- TTI::TCK_RecipThroughput) != 0)
+ if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0)
return false;
unsigned NumSlices = Item.size() / NumElts;
@@ -1960,7 +2172,8 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
const SmallPtrSet<Use *, 4> &IdentityLeafs,
const SmallPtrSet<Use *, 4> &SplatLeafs,
const SmallPtrSet<Use *, 4> &ConcatLeafs,
- IRBuilder<> &Builder) {
+ IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI) {
auto [FrontU, FrontLane] = Item.front();
if (IdentityLeafs.contains(FrontU)) {
@@ -1995,13 +2208,14 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
SmallVector<Value *> Ops(NumOps);
for (unsigned Idx = 0; Idx < NumOps; Idx++) {
- if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) {
+ if (II &&
+ isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
Ops[Idx] = II->getOperand(Idx);
continue;
}
- Ops[Idx] =
- generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty,
- IdentityLeafs, SplatLeafs, ConcatLeafs, Builder);
+ Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
+ Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
+ Builder, TTI);
}
SmallVector<Value *, 8> ValueList;
@@ -2097,7 +2311,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
all_of(drop_begin(Item), [Item](InstLane &IL) {
Value *FrontV = Item.front().first->get();
Use *U = IL.first;
- return !U || U->get() == FrontV;
+ return !U || (isa<Constant>(U->get()) &&
+ cast<Constant>(U->get())->getSplatValue() ==
+ cast<Constant>(FrontV)->getSplatValue());
})) {
SplatLeafs.insert(FrontU);
continue;
@@ -2127,7 +2343,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
return false;
if (auto *CI = dyn_cast<CastInst>(V))
- if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy())
+ if (CI->getSrcTy()->getScalarType() !=
+ cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
return false;
if (auto *SI = dyn_cast<SelectInst>(V))
if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
@@ -2152,7 +2369,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
continue;
- } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) {
+ } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
+ FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
continue;
} else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
@@ -2173,7 +2391,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
II && isTriviallyVectorizable(II->getIntrinsicID()) &&
!II->hasOperandBundles()) {
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
- if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
+ if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
+ &TTI)) {
if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
Value *FrontV = Item.front().first->get();
Use *U = IL.first;
@@ -2189,7 +2408,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
}
}
- if (isFreeConcat(Item, TTI)) {
+ if (isFreeConcat(Item, CostKind, TTI)) {
ConcatLeafs.insert(FrontU);
continue;
}
@@ -2200,11 +2419,13 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (NumVisited <= 1)
return false;
+ LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
+
// If we got this far, we know the shuffles are superfluous and can be
// removed. Scan through again and generate the new tree of instructions.
Builder.SetInsertPoint(&I);
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
- ConcatLeafs, Builder);
+ ConcatLeafs, Builder, &TTI);
replaceValue(I, *V);
return true;
}
@@ -2306,10 +2527,10 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
(UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
InstructionCost OldCost = TTI.getShuffleCost(
UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
- VecTyForCost, Shuffle->getShuffleMask());
+ VecTyForCost, Shuffle->getShuffleMask(), CostKind);
InstructionCost NewCost = TTI.getShuffleCost(
UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
- VecTyForCost, ConcatMask);
+ VecTyForCost, ConcatMask, CostKind);
LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
<< "\n");
@@ -2367,7 +2588,6 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
Type *ResultTy = I.getType();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost = TTI.getArithmeticReductionCost(
ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
@@ -2624,17 +2844,17 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
? TTI::SK_PermuteSingleSrc
: TTI::SK_PermuteTwoSrc,
- VT, SV->getShuffleMask());
+ VT, SV->getShuffleMask(), CostKind);
};
auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
- return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+ return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind);
};
// Get the costs of the shuffles + binops before and after with the new
// shuffle masks.
InstructionCost CostBefore =
- TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
- TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
@@ -2647,8 +2867,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
FixedVectorType *Op1SmallVT =
FixedVectorType::get(VT->getScalarType(), V2.size());
InstructionCost CostAfter =
- TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
- TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
InstructionCost(0), AddShuffleMaskCost);
std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
@@ -2717,7 +2937,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
/// Cost model calculations takes into account if zext(x) has other users and
/// whether it can be propagated through them too.
-bool VectorCombine::shrinkType(llvm::Instruction &I) {
+bool VectorCombine::shrinkType(Instruction &I) {
Value *ZExted, *OtherOperand;
if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
m_Value(OtherOperand))) &&
@@ -2746,7 +2966,6 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
// Calculate costs of leaving current IR as it is and moving ZExt operation
// later, along with adding truncates if needed
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ZExtCost = TTI.getCastInstrCost(
Instruction::ZExt, BigTy, SmallTy,
TargetTransformInfo::CastContextHint::None, CostKind);
@@ -2826,26 +3045,46 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
if (ExtIdx >= NumElts || InsIdx >= NumElts)
return false;
- SmallVector<int> Mask(NumElts, 0);
- std::iota(Mask.begin(), Mask.end(), 0);
- Mask[InsIdx] = ExtIdx + NumElts;
+ // Insertion into poison is a cheaper single operand shuffle.
+ TargetTransformInfo::ShuffleKind SK;
+ SmallVector<int> Mask(NumElts, PoisonMaskElem);
+ if (isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
+ SK = TargetTransformInfo::SK_PermuteSingleSrc;
+ Mask[InsIdx] = ExtIdx;
+ std::swap(DstVec, SrcVec);
+ } else {
+ SK = TargetTransformInfo::SK_PermuteTwoSrc;
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Mask[InsIdx] = ExtIdx + NumElts;
+ }
+
// Cost
auto *Ins = cast<InsertElementInst>(&I);
auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
-
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost OldCost =
- TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) +
+ InstructionCost InsCost =
TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
+ InstructionCost ExtCost =
+ TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+ InstructionCost OldCost = ExtCost + InsCost;
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask);
+ InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0,
+ nullptr, {DstVec, SrcVec});
if (!Ext->hasOneUse())
- NewCost += TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+ NewCost += ExtCost;
+
+ LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
if (OldCost < NewCost)
return false;
+ // Canonicalize undef param to RHS to help further folds.
+ if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ std::swap(DstVec, SrcVec);
+ }
+
Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
replaceValue(I, *Shuf);
@@ -2862,12 +3101,17 @@ bool VectorCombine::run() {
if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
return false;
+ LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
+
bool MadeChange = false;
auto FoldInst = [this, &MadeChange](Instruction &I) {
Builder.SetInsertPoint(&I);
+ bool IsVectorType = isa<VectorType>(I.getType());
bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
auto Opcode = I.getOpcode();
+ LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
+
// These folds should be beneficial regardless of when this pass is run
// in the optimization pipeline.
// The type checking is for run-time efficiency. We can avoid wasting time
@@ -2887,7 +3131,7 @@ bool VectorCombine::run() {
// This transform works with scalable and fixed vectors
// TODO: Identify and allow other scalable transforms
- if (isa<VectorType>(I.getType())) {
+ if (IsVectorType) {
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
MadeChange |= scalarizeVPIntrinsic(I);
@@ -2936,6 +3180,9 @@ bool VectorCombine::run() {
case Instruction::FCmp:
MadeChange |= foldExtractExtract(I);
break;
+ case Instruction::Or:
+ MadeChange |= foldConcatOfBoolMasks(I);
+ [[fallthrough]];
default:
if (Instruction::isBinaryOp(Opcode)) {
MadeChange |= foldExtractExtract(I);
@@ -2981,7 +3228,8 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
const DataLayout *DL = &F.getDataLayout();
- VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
+ TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;