summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp42
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp3
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp14
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp195
-rw-r--r--llvm/lib/Transforms/IPO/FunctionSpecialization.cpp50
-rw-r--r--llvm/lib/Transforms/IPO/GlobalOpt.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/IROutliner.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp13
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/SCCP.cpp16
-rw-r--r--llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp28
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp190
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp16
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp21
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp131
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h19
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp5
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp62
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp10
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp318
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp48
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp120
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp190
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp116
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp42
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/InferAlignment.cpp35
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp176
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp76
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp27
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/CodeLayout.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/Debugify.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/FunctionImportUtils.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/IRNormalizer.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/LoopPeel.cpp20
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp138
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LoopVersioning.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/ProfileVerify.cpp11
-rw-r--r--llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp7
-rw-r--r--llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp105
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp39
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp359
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp113
-rw-r--r--llvm/lib/Transforms/Utils/SymbolRewriter.cpp72
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp108
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp172
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h24
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp931
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp338
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp146
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h250
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp93
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h47
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp410
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp542
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h67
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp25
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp5
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp165
73 files changed, 4207 insertions, 2025 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 40a7f8043034..40de36d81ddd 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -83,8 +83,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
// == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
if (match(V, m_OneUse(m_c_Or(
m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
- m_LShr(m_Value(ShVal1),
- m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
+ m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
+ m_Deferred(ShAmt))))))) {
return Intrinsic::fshl;
}
@@ -617,7 +617,7 @@ struct LoadOps {
LoadInst *RootInsert = nullptr;
bool FoundRoot = false;
uint64_t LoadSize = 0;
- const APInt *Shift = nullptr;
+ uint64_t Shift = 0;
Type *ZextType;
AAMDNodes AATags;
};
@@ -627,17 +627,15 @@ struct LoadOps {
// (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
AliasAnalysis &AA) {
- const APInt *ShAmt2 = nullptr;
+ uint64_t ShAmt2;
Value *X;
Instruction *L1, *L2;
// Go to the last node with loads.
- if (match(V, m_OneUse(m_c_Or(
- m_Value(X),
- m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))),
- m_APInt(ShAmt2)))))) ||
- match(V, m_OneUse(m_Or(m_Value(X),
- m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) {
+ if (match(V,
+ m_OneUse(m_c_Or(m_Value(X), m_OneUse(m_ShlOrSelf(
+ m_OneUse(m_ZExt(m_Instruction(L2))),
+ ShAmt2)))))) {
if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot)
// Avoid Partial chain merge.
return false;
@@ -646,11 +644,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
// Check if the pattern has loads
LoadInst *LI1 = LOps.Root;
- const APInt *ShAmt1 = LOps.Shift;
+ uint64_t ShAmt1 = LOps.Shift;
if (LOps.FoundRoot == false &&
- (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) ||
- match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))),
- m_APInt(ShAmt1)))))) {
+ match(X, m_OneUse(
+ m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
LI1 = dyn_cast<LoadInst>(L1);
}
LoadInst *LI2 = dyn_cast<LoadInst>(L2);
@@ -726,13 +723,6 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
if (IsBigEndian)
std::swap(ShAmt1, ShAmt2);
- // Find Shifts values.
- uint64_t Shift1 = 0, Shift2 = 0;
- if (ShAmt1)
- Shift1 = ShAmt1->getZExtValue();
- if (ShAmt2)
- Shift2 = ShAmt2->getZExtValue();
-
// First load is always LI1. This is where we put the new load.
// Use the merged load size available from LI1 for forward loads.
if (LOps.FoundRoot) {
@@ -747,7 +737,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
uint64_t PrevSize =
DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
- if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
+ if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
return false;
// Update LOps
@@ -824,7 +814,7 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
// Check if shift needed. We need to shift with the amount of load1
// shift if not zero.
if (LOps.Shift)
- NewOp = Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift));
+ NewOp = Builder.CreateShl(NewOp, LOps.Shift);
I.replaceAllUsesWith(NewOp);
return true;
@@ -860,11 +850,9 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
return std::nullopt;
uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
- uint64_t ValOffset = 0;
+ uint64_t ValOffset;
Value *Val;
- if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
- m_ConstantInt(ValOffset))),
- m_Trunc(m_Value(Val)))))
+ if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
return std::nullopt;
Value *Ptr = Store->getPointerOperand();
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index b775c4346019..08f03aa45255 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -700,9 +700,6 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);
- assert(Shape.getPromiseAlloca() &&
- "Coroutine with switch ABI should own Promise alloca");
-
DIFile *DFile = DIS->getFile();
unsigned LineNum = DIS->getLine();
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 180ac9c61e7d..02c38d02cff6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1568,14 +1568,22 @@ private:
if (DebugLoc SuspendLoc = S->getDebugLoc()) {
std::string LabelName =
("__coro_resume_" + Twine(SuspendIndex)).str();
- DILocation &DILoc = *SuspendLoc;
+ // Take the "inlined at" location recursively, if present. This is
+ // mandatory as the DILabel insertion checks that the scopes of label
+ // and the attached location match. This is not the case when the
+ // suspend location has been inlined due to pointing to the original
+ // scope.
+ DILocation *DILoc = SuspendLoc;
+ while (DILocation *InlinedAt = DILoc->getInlinedAt())
+ DILoc = InlinedAt;
+
DILabel *ResumeLabel =
- DBuilder.createLabel(DIS, LabelName, DILoc.getFile(),
+ DBuilder.createLabel(DIS, LabelName, DILoc->getFile(),
SuspendLoc.getLine(), SuspendLoc.getCol(),
/*IsArtificial=*/true,
/*CoroSuspendIdx=*/SuspendIndex,
/*AlwaysPreserve=*/false);
- DBuilder.insertLabel(ResumeLabel, &DILoc, ResumeBB->begin());
+ DBuilder.insertLabel(ResumeLabel, DILoc, ResumeBB->begin());
}
}
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 7bcb20de46ff..83aa7de5400f 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/JSON.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -1550,6 +1551,7 @@ void llvm::computeDeadSymbolsWithConstProp(
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
bool ImportEnabled) {
+ llvm::TimeTraceScope timeScope("Drop dead symbols and propagate attributes");
computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols,
isPrevailing);
if (ImportEnabled)
@@ -1664,6 +1666,7 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
void llvm::thinLTOFinalizeInModule(Module &TheModule,
const GVSummaryMapTy &DefinedGlobals,
bool PropagateAttrs) {
+ llvm::TimeTraceScope timeScope("ThinLTO finalize in module");
DenseSet<Comdat *> NonPrevailingComdats;
auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) {
// See if the global summary analysis computed a new resolved linkage.
@@ -1791,6 +1794,7 @@ void llvm::thinLTOFinalizeInModule(Module &TheModule,
/// Run internalization on \p TheModule based on symmary analysis.
void llvm::thinLTOInternalizeModule(Module &TheModule,
const GVSummaryMapTy &DefinedGlobals) {
+ llvm::TimeTraceScope timeScope("ThinLTO internalize module");
// Declare a callback for the internalize pass that will ask for every
// candidate GlobalValue if it can be internalized or not.
auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
@@ -1885,6 +1889,7 @@ Expected<bool> FunctionImporter::importFunctions(
// Do the actual import of functions now, one Module at a time
for (const auto &ModName : ImportList.getSourceModules()) {
+ llvm::TimeTraceScope timeScope("Import", ModName);
// Get the module for the import
Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(ModName);
if (!SrcModuleOrErr)
@@ -1900,102 +1905,114 @@ Expected<bool> FunctionImporter::importFunctions(
// Find the globals to import
SetVector<GlobalValue *> GlobalsToImport;
- for (Function &F : *SrcModule) {
- if (!F.hasName())
- continue;
- auto GUID = F.getGUID();
- auto MaybeImportType = ImportList.getImportType(ModName, GUID);
- bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
- LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
- << " importing function"
- << (ImportDefinition
- ? " definition "
- : (MaybeImportType ? " declaration " : " "))
- << GUID << " " << F.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (ImportDefinition) {
- if (Error Err = F.materialize())
- return std::move(Err);
- // MemProf should match function's definition and summary,
- // 'thinlto_src_module' is needed.
- if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
- // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
- // statistics and debugging.
- F.setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getModuleIdentifier())}));
- F.setMetadata(
- "thinlto_src_file",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
+ {
+ llvm::TimeTraceScope functionsScope("Functions");
+ for (Function &F : *SrcModule) {
+ if (!F.hasName())
+ continue;
+ auto GUID = F.getGUID();
+ auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+ bool ImportDefinition =
+ MaybeImportType == GlobalValueSummary::Definition;
+
+ LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+ << " importing function"
+ << (ImportDefinition
+ ? " definition "
+ : (MaybeImportType ? " declaration " : " "))
+ << GUID << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (ImportDefinition) {
+ if (Error Err = F.materialize())
+ return std::move(Err);
+ // MemProf should match function's definition and summary,
+ // 'thinlto_src_module' is needed.
+ if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
+ // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
+ // statistics and debugging.
+ F.setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getModuleIdentifier())}));
+ F.setMetadata(
+ "thinlto_src_file",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(&F);
}
- GlobalsToImport.insert(&F);
}
}
- for (GlobalVariable &GV : SrcModule->globals()) {
- if (!GV.hasName())
- continue;
- auto GUID = GV.getGUID();
- auto MaybeImportType = ImportList.getImportType(ModName, GUID);
- bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
- LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
- << " importing global"
- << (ImportDefinition
- ? " definition "
- : (MaybeImportType ? " declaration " : " "))
- << GUID << " " << GV.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (ImportDefinition) {
- if (Error Err = GV.materialize())
- return std::move(Err);
- ImportedGVCount += GlobalsToImport.insert(&GV);
+ {
+ llvm::TimeTraceScope globalsScope("Globals");
+ for (GlobalVariable &GV : SrcModule->globals()) {
+ if (!GV.hasName())
+ continue;
+ auto GUID = GV.getGUID();
+ auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+ bool ImportDefinition =
+ MaybeImportType == GlobalValueSummary::Definition;
+
+ LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+ << " importing global"
+ << (ImportDefinition
+ ? " definition "
+ : (MaybeImportType ? " declaration " : " "))
+ << GUID << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (ImportDefinition) {
+ if (Error Err = GV.materialize())
+ return std::move(Err);
+ ImportedGVCount += GlobalsToImport.insert(&GV);
+ }
}
}
- for (GlobalAlias &GA : SrcModule->aliases()) {
- if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
- continue;
- auto GUID = GA.getGUID();
- auto MaybeImportType = ImportList.getImportType(ModName, GUID);
- bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
- LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
- << " importing alias"
- << (ImportDefinition
- ? " definition "
- : (MaybeImportType ? " declaration " : " "))
- << GUID << " " << GA.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (ImportDefinition) {
- if (Error Err = GA.materialize())
- return std::move(Err);
- // Import alias as a copy of its aliasee.
- GlobalObject *GO = GA.getAliaseeObject();
- if (Error Err = GO->materialize())
- return std::move(Err);
- auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " "
- << GO->getName() << " from "
+ {
+ llvm::TimeTraceScope aliasesScope("Aliases");
+ for (GlobalAlias &GA : SrcModule->aliases()) {
+ if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
+ continue;
+ auto GUID = GA.getGUID();
+ auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+ bool ImportDefinition =
+ MaybeImportType == GlobalValueSummary::Definition;
+
+ LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+ << " importing alias"
+ << (ImportDefinition
+ ? " definition "
+ : (MaybeImportType ? " declaration " : " "))
+ << GUID << " " << GA.getName() << " from "
<< SrcModule->getSourceFileName() << "\n");
- if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
- // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
- // statistics and debugging.
- Fn->setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getModuleIdentifier())}));
- Fn->setMetadata(
- "thinlto_src_file",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
+ if (ImportDefinition) {
+ if (Error Err = GA.materialize())
+ return std::move(Err);
+ // Import alias as a copy of its aliasee.
+ GlobalObject *GO = GA.getAliaseeObject();
+ if (Error Err = GO->materialize())
+ return std::move(Err);
+ auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID()
+ << " " << GO->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
+ // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
+ // statistics and debugging.
+ Fn->setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getModuleIdentifier())}));
+ Fn->setMetadata(
+ "thinlto_src_file",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(Fn);
}
- GlobalsToImport.insert(Fn);
}
}
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 9196a0147c43..30459caee160 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -89,6 +89,8 @@ static cl::opt<bool> SpecializeLiteralConstant(
"Enable specialization of functions that take a literal constant as an "
"argument"));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB,
BasicBlock *Succ) const {
unsigned I = 0;
@@ -784,9 +786,31 @@ bool FunctionSpecializer::run() {
// Update the known call sites to call the clone.
for (CallBase *Call : S.CallSites) {
+ Function *Clone = S.Clone;
LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
- << " to call " << S.Clone->getName() << "\n");
+ << " to call " << Clone->getName() << "\n");
Call->setCalledFunction(S.Clone);
+ auto &BFI = GetBFI(*Call->getFunction());
+ std::optional<uint64_t> Count =
+ BFI.getBlockProfileCount(Call->getParent());
+ if (Count && !ProfcheckDisableMetadataFixes) {
+ std::optional<llvm::Function::ProfileCount> MaybeCloneCount =
+ Clone->getEntryCount();
+ assert(MaybeCloneCount && "Clone entry count was not set!");
+ uint64_t CallCount = *Count + MaybeCloneCount->getCount();
+ Clone->setEntryCount(CallCount);
+ if (std::optional<llvm::Function::ProfileCount> MaybeOriginalCount =
+ S.F->getEntryCount()) {
+ uint64_t OriginalCount = MaybeOriginalCount->getCount();
+ if (OriginalCount >= CallCount) {
+ S.F->setEntryCount(OriginalCount - CallCount);
+ } else {
+ // This should generally not happen as that would mean there are
+ // more computed calls to the function than what was recorded.
+ LLVM_DEBUG(S.F->setEntryCount(0));
+ }
+ }
+ }
}
Clones.push_back(S.Clone);
@@ -838,14 +862,24 @@ bool FunctionSpecializer::run() {
}
void FunctionSpecializer::removeDeadFunctions() {
- for (Function *F : FullySpecialized) {
+ for (Function *F : DeadFunctions) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
<< F->getName() << "\n");
if (FAM)
FAM->clear(*F, F->getName());
+
+ // Remove all the callsites that were proven unreachable once, and replace
+ // them with poison.
+ for (User *U : make_early_inc_range(F->users())) {
+ assert((isa<CallInst>(U) || isa<InvokeInst>(U)) &&
+ "User of dead function must be call or invoke");
+ Instruction *CS = cast<Instruction>(U);
+ CS->replaceAllUsesWith(PoisonValue::get(CS->getType()));
+ CS->eraseFromParent();
+ }
F->eraseFromParent();
}
- FullySpecialized.clear();
+ DeadFunctions.clear();
}
/// Clone the function \p F and remove the ssa_copy intrinsics added by
@@ -1033,6 +1067,9 @@ Function *FunctionSpecializer::createSpecialization(Function *F,
// clone must.
Clone->setLinkage(GlobalValue::InternalLinkage);
+ if (F->getEntryCount() && !ProfcheckDisableMetadataFixes)
+ Clone->setEntryCount(0);
+
// Initialize the lattice state of the arguments of the function clone,
// marking the argument on which we specialized the function constant
// with the given value.
@@ -1206,8 +1243,11 @@ void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
// If the function has been completely specialized, the original function
// is no longer needed. Mark it unreachable.
- if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
+ // NOTE: If the address of a function is taken, we cannot treat it as dead
+ // function.
+ if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F) &&
+ !F->hasAddressTaken()) {
Solver.markFunctionUnreachable(F);
- FullySpecialized.insert(F);
+ DeadFunctions.insert(F);
}
}
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index d7edd1288309..f88d51f443bc 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2551,7 +2551,8 @@ static bool OptimizeNonTrivialIFuncs(
}))
continue;
- assert(!Callees.empty() && "Expecting successful collection of versions");
+ if (Callees.empty())
+ continue;
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
<< Resolver->getName() << "\n");
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index c57981ae4ca0..fdf0c3ac8007 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -686,9 +686,6 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
/* Outlined code is optimized code by definition. */
DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
- // Don't add any new variables to the subprogram.
- DB.finalizeSubprogram(OutlinedSP);
-
// Attach subprogram to the function.
F->setSubprogram(OutlinedSP);
// We're done with the DIBuilder.
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 57844a10aa9c..821a9d82ddb0 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -504,10 +504,7 @@ class LowerTypeTestsModule {
void importTypeTest(CallInst *CI);
void importFunction(Function *F, bool isJumpTableCanonical);
- BitSetInfo
- buildBitSet(Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
- ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+ ByteArrayInfo *createByteArray(const BitSetInfo &BSI);
void allocateByteArrays();
Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
Value *BitOffset);
@@ -578,9 +575,9 @@ public:
/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
-BitSetInfo LowerTypeTestsModule::buildBitSet(
- Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+static BitSetInfo
+buildBitSet(Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
BitSetBuilder BSB;
// Compute the byte offset of each address associated with this type
@@ -615,7 +612,7 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
}
-ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(const BitSetInfo &BSI) {
// Create globals to stand in for byte arrays and masks. These never actually
// get initialized, we RAUW and erase them later in allocateByteArrays() once
// we know the offset and mask to use.
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b8c99f1f3389..7f9693169af0 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3965,6 +3965,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
void ModuleCallsiteContextGraph::updateAllocationCall(
CallInfo &Call, AllocationType AllocType) {
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+ removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5501,6 +5502,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ removeAnyExistingAmbiguousAttribute(CBClone);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
<< ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index d50de34dfa48..2ecadd529170 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -169,6 +169,13 @@ static bool runIPSCCP(
for (Function &F : M) {
if (F.isDeclaration())
continue;
+ // Skip the dead functions marked by FunctionSpecializer, avoiding removing
+ // blocks in dead functions. Set MadeChanges if there is any dead function
+ // that will be removed later.
+ if (IsFuncSpecEnabled && Specializer.isDeadFunction(&F)) {
+ MadeChanges = true;
+ continue;
+ }
SmallVector<BasicBlock *, 512> BlocksToErase;
@@ -326,12 +333,15 @@ static bool runIPSCCP(
LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
<< "' is constant!\n");
for (User *U : make_early_inc_range(GV->users())) {
- // We can remove LoadInst here, because we already replaced its users
- // with a constant.
+ // We can remove LoadInst here. The LoadInsts in dead functions marked by
+ // FuncSpec are not simplified to constants, thus poison them.
assert((isa<StoreInst>(U) || isa<LoadInst>(U)) &&
"Only Store|Load Instruction can be user of GlobalVariable at "
"reaching here.");
- cast<Instruction>(U)->eraseFromParent();
+ Instruction *I = cast<Instruction>(U);
+ if (isa<LoadInst>(I))
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+ I->eraseFromParent();
}
// Try to create a debug constant expression for the global variable
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 838f97c8f49a..2340fe556538 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -269,6 +269,12 @@ static bool enableUnifiedLTO(Module &M) {
}
#endif
+bool mustEmitToMergedModule(const GlobalValue *GV) {
+ // The __cfi_check definition is filled in by the CrossDSOCFI pass which
+ // runs only in the merged module.
+ return GV->getName() == "__cfi_check";
+}
+
// If it's possible to split M into regular and thin LTO parts, do so and write
// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
// regular LTO bitcode file to OS.
@@ -350,19 +356,13 @@ void splitAndWriteThinLTOBitcode(
});
}
- auto MustEmitToMergedModule = [](const GlobalValue *GV) {
- // The __cfi_check definition is filled in by the CrossDSOCFI pass which
- // runs only in the merged module.
- return GV->getName() == "__cfi_check";
- };
-
ValueToValueMapTy VMap;
std::unique_ptr<Module> MergedM(
CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
if (const auto *C = GV->getComdat())
if (MergedMComdats.count(C))
return true;
- if (MustEmitToMergedModule(GV))
+ if (mustEmitToMergedModule(GV))
return true;
if (auto *F = dyn_cast<Function>(GV))
return EligibleVirtualFns.count(F);
@@ -380,7 +380,7 @@ void splitAndWriteThinLTOBitcode(
cloneUsedGlobalVariables(M, *MergedM, /*CompilerUsed*/ true);
for (Function &F : *MergedM)
- if (!F.isDeclaration() && !MustEmitToMergedModule(&F)) {
+ if (!F.isDeclaration() && !mustEmitToMergedModule(&F)) {
// Reset the linkage of all functions eligible for virtual constant
// propagation. The canonical definitions live in the thin LTO module so
// that they can be imported.
@@ -406,7 +406,7 @@ void splitAndWriteThinLTOBitcode(
if (const auto *C = GV->getComdat())
if (MergedMComdats.count(C))
return false;
- if (MustEmitToMergedModule(GV))
+ if (mustEmitToMergedModule(GV))
return false;
return true;
});
@@ -529,11 +529,13 @@ bool enableSplitLTOUnit(Module &M) {
return EnableSplitLTOUnit;
}
-// Returns whether this module needs to be split because it uses type metadata.
-bool hasTypeMetadata(Module &M) {
+// Returns whether this module needs to be split (if splitting is enabled).
+bool requiresSplit(Module &M) {
for (auto &GO : M.global_objects()) {
if (GO.hasMetadata(LLVMContext::MD_type))
return true;
+ if (mustEmitToMergedModule(&GO))
+ return true;
}
return false;
}
@@ -543,9 +545,9 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
Module &M, const ModuleSummaryIndex *Index,
const bool ShouldPreserveUseListOrder) {
std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
- // See if this module has any type metadata. If so, we try to split it
+ // See if this module needs to be split. If so, we try to split it
// or at least promote type ids to enable WPD.
- if (hasTypeMetadata(M)) {
+ if (requiresSplit(M)) {
if (enableSplitLTOUnit(M)) {
splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M,
ShouldPreserveUseListOrder);
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index aec484f8a18f..bfb25c806e53 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -60,6 +60,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeReader.h"
@@ -68,6 +69,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
@@ -82,12 +84,15 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GlobPattern.h"
+#include "llvm/Support/TimeProfiler.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
@@ -95,6 +100,7 @@
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include <algorithm>
+#include <cmath>
#include <cstddef>
#include <map>
#include <set>
@@ -167,6 +173,8 @@ static cl::list<std::string>
cl::desc("Prevent function(s) from being devirtualized"),
cl::Hidden, cl::CommaSeparated);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
/// With Clang, a pure virtual class's deleting destructor is emitted as a
/// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the
/// context of whole program devirtualization, the deleting destructor of a pure
@@ -451,21 +459,21 @@ struct VirtualCallSite {
void
emitRemark(const StringRef OptName, const StringRef TargetName,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
+ function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter) {
Function *F = CB.getCaller();
DebugLoc DLoc = CB.getDebugLoc();
BasicBlock *Block = CB.getParent();
using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
- << NV("Optimization", OptName)
- << ": devirtualized a call to "
- << NV("FunctionName", TargetName));
+ OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+ << NV("Optimization", OptName)
+ << ": devirtualized a call to "
+ << NV("FunctionName", TargetName));
}
void replaceAndErase(
const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter,
Value *New) {
if (RemarksEnabled)
emitRemark(OptName, TargetName, OREGetter);
@@ -570,25 +578,24 @@ void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
struct DevirtModule {
Module &M;
- function_ref<AAResults &(Function &)> AARGetter;
- function_ref<DominatorTree &(Function &)> LookupDomTree;
+ ModuleAnalysisManager &MAM;
+ FunctionAnalysisManager &FAM;
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
+ ModuleSummaryIndex *const ExportSummary;
+ const ModuleSummaryIndex *const ImportSummary;
- IntegerType *Int8Ty;
- PointerType *Int8PtrTy;
- IntegerType *Int32Ty;
- IntegerType *Int64Ty;
- IntegerType *IntPtrTy;
+ IntegerType *const Int8Ty;
+ PointerType *const Int8PtrTy;
+ IntegerType *const Int32Ty;
+ IntegerType *const Int64Ty;
+ IntegerType *const IntPtrTy;
/// Sizeless array type, used for imported vtables. This provides a signal
/// to analyzers that these imports may alias, as they do for example
/// when multiple unique return values occur in the same vtable.
- ArrayType *Int8Arr0Ty;
-
- bool RemarksEnabled;
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
+ ArrayType *const Int8Arr0Ty;
+ const bool RemarksEnabled;
+ std::function<OptimizationRemarkEmitter &(Function &)> OREGetter;
MapVector<VTableSlot, VTableSlotInfo> CallSlots;
// Calls that have already been optimized. We may add a call to multiple
@@ -611,12 +618,11 @@ struct DevirtModule {
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
PatternList FunctionsToSkip;
- DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
+ DevirtModule(Module &M, ModuleAnalysisManager &MAM,
ModuleSummaryIndex *ExportSummary,
const ModuleSummaryIndex *ImportSummary)
- : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
+ : M(M), MAM(MAM),
+ FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
ExportSummary(ExportSummary), ImportSummary(ImportSummary),
Int8Ty(Type::getInt8Ty(M.getContext())),
Int8PtrTy(PointerType::getUnqual(M.getContext())),
@@ -624,7 +630,10 @@ struct DevirtModule {
Int64Ty(Type::getInt64Ty(M.getContext())),
IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
- RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
+ RemarksEnabled(areRemarksEnabled()),
+ OREGetter([&](Function &F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ }) {
assert(!(ExportSummary && ImportSummary));
FunctionsToSkip.init(SkipFunctionNames);
}
@@ -653,7 +662,7 @@ struct DevirtModule {
VTableSlotInfo &SlotInfo,
WholeProgramDevirtResolution *Res);
- void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+ void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Function &JT,
bool &IsExported);
void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
VTableSlotInfo &SlotInfo,
@@ -738,10 +747,7 @@ struct DevirtModule {
// Lower the module using the action and summary passed as command line
// arguments. For testing purposes only.
- static bool
- runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree);
+ static bool runForTesting(Module &M, ModuleAnalysisManager &MAM);
};
struct DevirtIndex {
@@ -782,25 +788,13 @@ struct DevirtIndex {
} // end anonymous namespace
PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto AARGetter = [&](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- };
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
- auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
+ ModuleAnalysisManager &MAM) {
if (UseCommandLine) {
- if (!DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree))
+ if (!DevirtModule::runForTesting(M, MAM))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
- if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
- ImportSummary)
- .run())
+ if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
@@ -832,8 +826,8 @@ typeIDVisibleToRegularObj(StringRef TypeID,
// function for the base type and thus only contains a reference to the
// type info (_ZTI). To catch this case we query using the type info
// symbol corresponding to the TypeID.
- std::string typeInfo = ("_ZTI" + TypeID).str();
- return IsVisibleToRegularObj(typeInfo);
+ std::string TypeInfo = ("_ZTI" + TypeID).str();
+ return IsVisibleToRegularObj(TypeInfo);
}
static bool
@@ -842,7 +836,7 @@ skipUpdateDueToValidation(GlobalVariable &GV,
SmallVector<MDNode *, 2> Types;
GV.getMetadata(LLVMContext::MD_type, Types);
- for (auto Type : Types)
+ for (auto *Type : Types)
if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get()))
return typeIDVisibleToRegularObj(TypeID->getString(),
IsVisibleToRegularObj);
@@ -881,6 +875,7 @@ void llvm::updateVCallVisibilityInModule(
void llvm::updatePublicTypeTestCalls(Module &M,
bool WholeProgramVisibilityEnabledInLTO) {
+ llvm::TimeTraceScope timeScope("Update public type test calls");
Function *PublicTypeTestFunc =
Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
if (!PublicTypeTestFunc)
@@ -912,9 +907,9 @@ void llvm::getVisibleToRegularObjVtableGUIDs(
ModuleSummaryIndex &Index,
DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
function_ref<bool(StringRef)> IsVisibleToRegularObj) {
- for (const auto &typeID : Index.typeIdCompatibleVtableMap()) {
- if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj))
- for (const TypeIdOffsetVtableInfo &P : typeID.second)
+ for (const auto &TypeID : Index.typeIdCompatibleVtableMap()) {
+ if (typeIDVisibleToRegularObj(TypeID.first, IsVisibleToRegularObj))
+ for (const TypeIdOffsetVtableInfo &P : TypeID.second)
VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID());
}
}
@@ -957,7 +952,7 @@ void llvm::runWholeProgramDevirtOnIndex(
void llvm::updateIndexWPDForExports(
ModuleSummaryIndex &Summary,
- function_ref<bool(StringRef, ValueInfo)> isExported,
+ function_ref<bool(StringRef, ValueInfo)> IsExported,
std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
for (auto &T : LocalWPDTargetsMap) {
auto &VI = T.first;
@@ -965,7 +960,7 @@ void llvm::updateIndexWPDForExports(
assert(VI.getSummaryList().size() == 1 &&
"Devirt of local target has more than one copy");
auto &S = VI.getSummaryList()[0];
- if (!isExported(S->modulePath(), VI))
+ if (!IsExported(S->modulePath(), VI))
continue;
// It's been exported by a cross module import.
@@ -995,10 +990,7 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
return ErrorSuccess();
}
-bool DevirtModule::runForTesting(
- Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
+bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) {
std::unique_ptr<ModuleSummaryIndex> Summary =
std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
@@ -1023,7 +1015,7 @@ bool DevirtModule::runForTesting(
}
bool Changed =
- DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
+ DevirtModule(M, MAM,
ClSummaryAction == PassSummaryAction::Export ? Summary.get()
: nullptr,
ClSummaryAction == PassSummaryAction::Import ? Summary.get()
@@ -1071,7 +1063,7 @@ void DevirtModule::buildTypeIdentifierMap(
}
for (MDNode *Type : Types) {
- auto TypeID = Type->getOperand(1).get();
+ auto *TypeID = Type->getOperand(1).get();
uint64_t Offset =
cast<ConstantInt>(
@@ -1120,7 +1112,7 @@ bool DevirtModule::tryFindVirtualCallTargets(
// Save the symbol used in the vtable to use as the devirtualization
// target.
- auto GV = dyn_cast<GlobalValue>(C);
+ auto *GV = dyn_cast<GlobalValue>(C);
assert(GV);
TargetsForSlot.push_back({GV, &TM});
}
@@ -1284,7 +1276,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
Apply(P.second);
}
-static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
+static bool addCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
// We can't add calls if we haven't seen a definition
if (Callee.getSummaryList().empty())
return false;
@@ -1359,7 +1351,7 @@ bool DevirtModule::trySingleImplDevirt(
if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
// Any needed promotion of 'TheFn' has already been done during
// LTO unit split, so we can ignore return value of addCalls.
- AddCalls(SlotInfo, TheFnVI);
+ addCalls(SlotInfo, TheFnVI);
Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
Res->SingleImplName = std::string(TheFn->getName());
@@ -1400,7 +1392,7 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
DevirtTargets.insert(TheFn);
auto &S = TheFn.getSummaryList()[0];
- bool IsExported = AddCalls(SlotInfo, TheFn);
+ bool IsExported = addCalls(SlotInfo, TheFn);
if (IsExported)
ExportedGUIDs.insert(TheFn.getGUID());
@@ -1497,13 +1489,19 @@ void DevirtModule::tryICallBranchFunnel(
ReturnInst::Create(M.getContext(), nullptr, BB);
bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ applyICallBranchFunnel(SlotInfo, *JT, IsExported);
if (IsExported)
Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+
+ if (!JT->getEntryCount().has_value()) {
+ // FIXME: We could pass the necessary information through ThinLTO.
+ setExplicitlyUnknownFunctionEntryCount(*JT);
+ }
}
void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
- Constant *JT, bool &IsExported) {
+ Function &JT, bool &IsExported) {
+ DenseMap<Function *, double> FunctionEntryCounts;
auto Apply = [&](CallSiteInfo &CSInfo) {
if (CSInfo.isExported())
IsExported = true;
@@ -1531,8 +1529,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
NumBranchFunnel++;
if (RemarksEnabled)
- VCallSite.emitRemark("branch-funnel",
- JT->stripPointerCasts()->getName(), OREGetter);
+ VCallSite.emitRemark("branch-funnel", JT.getName(), OREGetter);
// Pass the address of the vtable in the nest register, which is r10 on
// x86_64.
@@ -1548,11 +1545,28 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
llvm::append_range(Args, CB.args());
CallBase *NewCS = nullptr;
+ if (!JT.isDeclaration() && !ProfcheckDisableMetadataFixes) {
+ // Accumulate the call frequencies of the original call site, and use
+ // that as total entry count for the funnel function.
+ auto &F = *CB.getCaller();
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto EC = BFI.getBlockFreq(&F.getEntryBlock());
+ auto CC = F.getEntryCount(/*AllowSynthetic=*/true);
+ double CallCount = 0.0;
+ if (EC.getFrequency() != 0 && CC && CC->getCount() != 0) {
+ double CallFreq =
+ static_cast<double>(
+ BFI.getBlockFreq(CB.getParent()).getFrequency()) /
+ EC.getFrequency();
+ CallCount = CallFreq * CC->getCount();
+ }
+ FunctionEntryCounts[&JT] += CallCount;
+ }
if (isa<CallInst>(CB))
- NewCS = IRB.CreateCall(NewFT, JT, Args);
+ NewCS = IRB.CreateCall(NewFT, &JT, Args);
else
NewCS =
- IRB.CreateInvoke(NewFT, JT, cast<InvokeInst>(CB).getNormalDest(),
+ IRB.CreateInvoke(NewFT, &JT, cast<InvokeInst>(CB).getNormalDest(),
cast<InvokeInst>(CB).getUnwindDest(), Args);
NewCS->setCallingConv(CB.getCallingConv());
@@ -1586,6 +1600,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
Apply(SlotInfo.CSInfo);
for (auto &P : SlotInfo.ConstCSInfo)
Apply(P.second);
+ for (auto &[F, C] : FunctionEntryCounts) {
+ assert(!F->getEntryCount(/*AllowSynthetic=*/true) &&
+ "Unexpected entry count for funnel that was freshly synthesized");
+ F->setEntryCount(static_cast<uint64_t>(std::round(C)));
+ }
}
bool DevirtModule::tryEvaluateFunctionsWithArgs(
@@ -1597,7 +1616,7 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
// TODO: Skip for now if the vtable symbol was an alias to a function,
// need to evaluate whether it would be correct to analyze the aliasee
// function for this optimization.
- auto Fn = dyn_cast<Function>(Target.Fn);
+ auto *Fn = dyn_cast<Function>(Target.Fn);
if (!Fn)
return false;
@@ -1836,11 +1855,11 @@ bool DevirtModule::tryVirtualConstProp(
// TODO: Skip for now if the vtable symbol was an alias to a function,
// need to evaluate whether it would be correct to analyze the aliasee
// function for this optimization.
- auto Fn = dyn_cast<Function>(TargetsForSlot[0].Fn);
+ auto *Fn = dyn_cast<Function>(TargetsForSlot[0].Fn);
if (!Fn)
return false;
// This only works if the function returns an integer.
- auto RetType = dyn_cast<IntegerType>(Fn->getReturnType());
+ auto *RetType = dyn_cast<IntegerType>(Fn->getReturnType());
if (!RetType)
return false;
unsigned BitWidth = RetType->getBitWidth();
@@ -1871,12 +1890,12 @@ bool DevirtModule::tryVirtualConstProp(
// TODO: Skip for now if the vtable symbol was an alias to a function,
// need to evaluate whether it would be correct to analyze the aliasee
// function for this optimization.
- auto Fn = dyn_cast<Function>(Target.Fn);
+ auto *Fn = dyn_cast<Function>(Target.Fn);
if (!Fn)
return false;
if (Fn->isDeclaration() ||
- !computeFunctionBodyMemoryAccess(*Fn, AARGetter(*Fn))
+ !computeFunctionBodyMemoryAccess(*Fn, FAM.getResult<AAManager>(*Fn))
.doesNotAccessMemory() ||
Fn->arg_empty() || !Fn->arg_begin()->use_empty() ||
Fn->getReturnType() != RetType)
@@ -1992,11 +2011,11 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
// Build an anonymous global containing the before bytes, followed by the
// original initializer, followed by the after bytes.
- auto NewInit = ConstantStruct::getAnon(
+ auto *NewInit = ConstantStruct::getAnon(
{ConstantDataArray::get(M.getContext(), B.Before.Bytes),
B.GV->getInitializer(),
ConstantDataArray::get(M.getContext(), B.After.Bytes)});
- auto NewGV =
+ auto *NewGV =
new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
NewGV->setSection(B.GV->getSection());
@@ -2009,7 +2028,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
// Build an alias named after the original global, pointing at the second
// element (the original initializer).
- auto Alias = GlobalAlias::create(
+ auto *Alias = GlobalAlias::create(
B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
ConstantExpr::getInBoundsGetElementPtr(
NewInit->getType(), NewGV,
@@ -2050,7 +2069,7 @@ void DevirtModule::scanTypeTestUsers(
// Search for virtual calls based on %p and add them to DevirtCalls.
SmallVector<DevirtCallSite, 1> DevirtCalls;
SmallVector<CallInst *, 1> Assumes;
- auto &DT = LookupDomTree(*CI->getFunction());
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction());
findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
Metadata *TypeId =
@@ -2127,7 +2146,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
SmallVector<Instruction *, 1> LoadedPtrs;
SmallVector<Instruction *, 1> Preds;
bool HasNonCallUses = false;
- auto &DT = LookupDomTree(*CI->getFunction());
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction());
findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
HasNonCallUses, CI, DT);
@@ -2259,18 +2278,18 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
// The type of the function is irrelevant, because it's bitcast at calls
// anyhow.
- Constant *JT = cast<Constant>(
+ auto *JT = cast<Function>(
M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
Type::getVoidTy(M.getContext()))
.getCallee());
bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ applyICallBranchFunnel(SlotInfo, *JT, IsExported);
assert(!IsExported);
}
}
void DevirtModule::removeRedundantTypeTests() {
- auto True = ConstantInt::getTrue(M.getContext());
+ auto *True = ConstantInt::getTrue(M.getContext());
for (auto &&U : NumUnsafeUsesForTypeTest) {
if (U.second == 0) {
U.first->replaceAllUsesWith(True);
@@ -2490,18 +2509,17 @@ bool DevirtModule::run() {
// Generate remarks for each devirtualized function.
for (const auto &DT : DevirtTargets) {
GlobalValue *GV = DT.second;
- auto F = dyn_cast<Function>(GV);
+ auto *F = dyn_cast<Function>(GV);
if (!F) {
- auto A = dyn_cast<GlobalAlias>(GV);
+ auto *A = dyn_cast<GlobalAlias>(GV);
assert(A && isa<Function>(A->getAliasee()));
F = dyn_cast<Function>(A->getAliasee());
assert(F);
}
using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
- << "devirtualized "
- << NV("FunctionName", DT.first));
+ OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+ << "devirtualized " << NV("FunctionName", DT.first));
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d934638c15e7..f9155cc66031 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2115,6 +2115,7 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
}
// Find common base and collect RHS GEPs.
+ bool First = true;
while (true) {
if (Ptrs.contains(RHS)) {
Base.Ptr = RHS;
@@ -2123,7 +2124,12 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
if (auto *GEP = dyn_cast<GEPOperator>(RHS)) {
Base.RHSGEPs.push_back(GEP);
- Base.RHSNW &= GEP->getNoWrapFlags();
+ if (First) {
+ First = false;
+ Base.RHSNW = GEP->getNoWrapFlags();
+ } else {
+ Base.RHSNW = Base.RHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags());
+ }
RHS = GEP->getPointerOperand();
} else {
// No common base.
@@ -2132,13 +2138,19 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
}
// Collect LHS GEPs.
+ First = true;
while (true) {
if (LHS == Base.Ptr)
break;
auto *GEP = cast<GEPOperator>(LHS);
Base.LHSGEPs.push_back(GEP);
- Base.LHSNW &= GEP->getNoWrapFlags();
+ if (First) {
+ First = false;
+ Base.LHSNW = GEP->getNoWrapFlags();
+ } else {
+ Base.LHSNW = Base.LHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags());
+ }
LHS = GEP->getPointerOperand();
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a13d3ceb6132..2d7524e8018b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1799,16 +1799,21 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
// type may provide more information to later folds, and the smaller logic
// instruction may be cheaper (particularly in the case of vectors).
Value *X;
+ auto &DL = IC.getDataLayout();
if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
- if (Constant *TruncC = IC.getLosslessUnsignedTrunc(C, SrcTy)) {
+ PreservedCastFlags Flags;
+ if (Constant *TruncC = getLosslessUnsignedTrunc(C, SrcTy, DL, &Flags)) {
// LogicOpc (zext X), C --> zext (LogicOpc X, C)
Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC);
- return new ZExtInst(NewOp, DestTy);
+ auto *ZExt = new ZExtInst(NewOp, DestTy);
+ ZExt->setNonNeg(Flags.NNeg);
+ ZExt->andIRFlags(Cast);
+ return ZExt;
}
}
if (match(Cast, m_OneUse(m_SExtLike(m_Value(X))))) {
- if (Constant *TruncC = IC.getLosslessSignedTrunc(C, SrcTy)) {
+ if (Constant *TruncC = getLosslessSignedTrunc(C, SrcTy, DL)) {
// LogicOpc (sext X), C --> sext (LogicOpc X, C)
Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC);
return new SExtInst(NewOp, DestTy);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 42b65dde6725..33b66aeaffe6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1956,7 +1956,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Constant *C;
if (match(I0, m_ZExt(m_Value(X))) && match(I1, m_Constant(C)) &&
I0->hasOneUse()) {
- if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType())) {
+ if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType(), DL)) {
Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC);
return CastInst::Create(Instruction::ZExt, NarrowMaxMin, II->getType());
}
@@ -2006,7 +2006,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Constant *C;
if (match(I0, m_SExt(m_Value(X))) && match(I1, m_Constant(C)) &&
I0->hasOneUse()) {
- if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType())) {
+ if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType(), DL)) {
Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC);
return CastInst::Create(Instruction::SExt, NarrowMaxMin, II->getType());
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index fdef49e310f8..ccf918f0b6db 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -11,11 +11,13 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
@@ -969,6 +971,25 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
Changed = true;
}
+ const APInt *C1;
+ Value *V1;
+ // OP = { lshr, ashr }
+ // trunc ( OP i8 C1, V1) to i1 -> icmp eq V1, log_2(C1) iff C1 is power of 2
+ if (DestWidth == 1 && match(Src, m_Shr(m_Power2(C1), m_Value(V1)))) {
+ Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero());
+ Value *Icmp = Builder.CreateICmpEQ(V1, Right);
+ return replaceInstUsesWith(Trunc, Icmp);
+ }
+
+ // OP = { lshr, ashr }
+ // trunc ( OP i8 C1, V1) to i1 -> icmp ult V1, log_2(C1 + 1) iff (C1 + 1) is
+ // power of 2
+ if (DestWidth == 1 && match(Src, m_Shr(m_LowBitMask(C1), m_Value(V1)))) {
+ Value *Right = ConstantInt::get(V1->getType(), C1->countr_one());
+ Value *Icmp = Builder.CreateICmpULT(V1, Right);
+ return replaceInstUsesWith(Trunc, Icmp);
+ }
+
return Changed ? &Trunc : nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3a8e04303815..99ea04816681 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
@@ -110,75 +111,41 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
/// If AndCst is non-null, then the loaded value is masked with that constant
/// before doing the comparison. This handles cases like "A[i]&4 == 0".
Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
- LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI,
- ConstantInt *AndCst) {
- if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() ||
- !GV->getValueType()->isArrayTy() || !GV->isConstant() ||
+ LoadInst *LI, GetElementPtrInst *GEP, CmpInst &ICI, ConstantInt *AndCst) {
+ auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(GEP));
+ if (LI->isVolatile() || !GV || !GV->isConstant() ||
!GV->hasDefinitiveInitializer())
return nullptr;
- Type *GEPSrcEltTy = GEP->getSourceElementType();
- if (GEPSrcEltTy->isArrayTy())
- GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
- if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy)
+ Type *EltTy = LI->getType();
+ TypeSize EltSize = DL.getTypeStoreSize(EltTy);
+ if (EltSize.isScalable())
return nullptr;
- Constant *Init = GV->getInitializer();
- if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+ LinearExpression Expr = decomposeLinearExpression(DL, GEP);
+ if (!Expr.Index || Expr.BasePtr != GV || Expr.Offset.getBitWidth() > 64)
return nullptr;
- uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
- // Don't blow up on huge arrays.
- if (ArrayElementCount > MaxArraySizeForCombine)
- return nullptr;
+ Constant *Init = GV->getInitializer();
+ TypeSize GlobalSize = DL.getTypeAllocSize(Init->getType());
- // There are many forms of this optimization we can handle, for now, just do
- // the simple index into a single-dimensional array or elements of equal size.
- //
- // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}}
- // Or: GEP i8 GV, Idx
+ Value *Idx = Expr.Index;
+ const APInt &Stride = Expr.Scale;
+ const APInt &ConstOffset = Expr.Offset;
- unsigned GEPIdxOp = 1;
- if (GEP->getSourceElementType()->isArrayTy()) {
- GEPIdxOp = 2;
- if (!match(GEP->getOperand(1), m_ZeroInt()))
- return nullptr;
- }
- if (GEP->getNumOperands() < GEPIdxOp + 1 ||
- isa<Constant>(GEP->getOperand(GEPIdxOp)))
+ // Allow an additional constant offset, but only within the stride.
+ if (!ConstOffset.ult(Stride))
return nullptr;
- // Check that indices after the variable are constants and in-range for the
- // type they index. Collect the indices. This is typically for arrays of
- // structs.
- SmallVector<unsigned, 4> LaterIndices;
-
- Type *EltTy = Init->getType()->getArrayElementType();
- for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) {
- ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!Idx)
- return nullptr; // Variable index.
-
- uint64_t IdxVal = Idx->getZExtValue();
- if ((unsigned)IdxVal != IdxVal)
- return nullptr; // Too large array index.
-
- if (StructType *STy = dyn_cast<StructType>(EltTy))
- EltTy = STy->getElementType(IdxVal);
- else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
- if (IdxVal >= ATy->getNumElements())
- return nullptr;
- EltTy = ATy->getElementType();
- } else {
- return nullptr; // Unknown type.
- }
-
- LaterIndices.push_back(IdxVal);
- }
+ // Don't handle overlapping loads for now.
+ if (!Stride.uge(EltSize.getFixedValue()))
+ return nullptr;
- Value *Idx = GEP->getOperand(GEPIdxOp);
- // If the index type is non-canonical, wait for it to be canonicalized.
- if (Idx->getType() != DL.getIndexType(GEP->getType()))
+ // Don't blow up on huge arrays.
+ uint64_t ArrayElementCount =
+ divideCeil((GlobalSize.getFixedValue() - ConstOffset.getZExtValue()),
+ Stride.getZExtValue());
+ if (ArrayElementCount > MaxArraySizeForCombine)
return nullptr;
enum { Overdefined = -3, Undefined = -2 };
@@ -211,18 +178,12 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Scan the array and see if one of our patterns matches.
Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
- for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
- Constant *Elt = Init->getAggregateElement(i);
+ APInt Offset = ConstOffset;
+ for (unsigned i = 0, e = ArrayElementCount; i != e; ++i, Offset += Stride) {
+ Constant *Elt = ConstantFoldLoadFromConst(Init, EltTy, Offset, DL);
if (!Elt)
return nullptr;
- // If this is indexing an array of structures, get the structure element.
- if (!LaterIndices.empty()) {
- Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices);
- if (!Elt)
- return nullptr;
- }
-
// If the element is masked, handle it.
if (AndCst) {
Elt = ConstantFoldBinaryOpOperands(Instruction::And, Elt, AndCst, DL);
@@ -309,19 +270,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Now that we've scanned the entire array, emit our new comparison(s). We
// order the state machines in complexity of the generated code.
- // If inbounds keyword is not present, Idx * ElementSize can overflow.
- // Let's assume that ElementSize is 2 and the wanted value is at offset 0.
+ // If inbounds keyword is not present, Idx * Stride can overflow.
+ // Let's assume that Stride is 2 and the wanted value is at offset 0.
// Then, there are two possible values for Idx to match offset 0:
// 0x00..00, 0x80..00.
// Emitting 'icmp eq Idx, 0' isn't correct in this case because the
// comparison is false if Idx was 0x80..00.
// We need to erase the highest countr_zero(Stride) bits of Idx.
- unsigned ElementSize =
- DL.getTypeAllocSize(Init->getType()->getArrayElementType());
auto MaskIdx = [&](Value *Idx) {
- if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) {
+ if (!Expr.Flags.isInBounds() && Stride.countr_zero() != 0) {
Value *Mask = Constant::getAllOnesValue(Idx->getType());
- Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize));
+ Mask = Builder.CreateLShr(Mask, Stride.countr_zero());
Idx = Builder.CreateAnd(Idx, Mask);
}
return Idx;
@@ -1997,10 +1956,8 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
if (auto *C2 = dyn_cast<ConstantInt>(Y))
if (auto *LI = dyn_cast<LoadInst>(X))
if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (Instruction *Res =
- foldCmpLoadFromIndexedGlobal(LI, GEP, GV, Cmp, C2))
- return Res;
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(LI, GEP, Cmp, C2))
+ return Res;
if (!Cmp.isEquality())
return nullptr;
@@ -4353,10 +4310,9 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
// Try to optimize things like "A[i] > 4" to index computations.
if (GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (Instruction *Res =
- foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, GV, I))
- return Res;
+ if (Instruction *Res =
+ foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I))
+ return Res;
break;
}
@@ -6375,7 +6331,7 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) {
// If a lossless truncate is possible...
Type *SrcTy = CastOp0->getSrcTy();
- Constant *Res = getLosslessTrunc(C, SrcTy, CastOp0->getOpcode());
+ Constant *Res = getLosslessInvCast(C, SrcTy, CastOp0->getOpcode(), DL);
if (Res) {
if (ICmp.isEquality())
return new ICmpInst(ICmp.getPredicate(), X, Res);
@@ -8837,10 +8793,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
break;
case Instruction::Load:
if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(
- cast<LoadInst>(LHSI), GEP, GV, I))
- return Res;
+ if (Instruction *Res =
+ foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I))
+ return Res;
break;
case Instruction::FPTrunc:
if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC))
@@ -8944,14 +8899,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
}
{
- Value *CanonLHS = nullptr, *CanonRHS = nullptr;
+ Value *CanonLHS = nullptr;
match(Op0, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonLHS)));
- match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS)));
-
// (canonicalize(x) == x) => (x == x)
if (CanonLHS == Op1)
return new FCmpInst(Pred, Op1, Op1, "", &I);
+ Value *CanonRHS = nullptr;
+ match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS)));
// (x == canonicalize(x)) => (x == x)
if (CanonRHS == Op0)
return new FCmpInst(Pred, Op0, Op0, "", &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 2340028ce93d..7a979c16da50 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -222,23 +222,6 @@ public:
bool fmulByZeroIsZero(Value *MulVal, FastMathFlags FMF,
const Instruction *CtxI) const;
- Constant *getLosslessTrunc(Constant *C, Type *TruncTy, unsigned ExtOp) {
- Constant *TruncC = ConstantExpr::getTrunc(C, TruncTy);
- Constant *ExtTruncC =
- ConstantFoldCastOperand(ExtOp, TruncC, C->getType(), DL);
- if (ExtTruncC && ExtTruncC == C)
- return TruncC;
- return nullptr;
- }
-
- Constant *getLosslessUnsignedTrunc(Constant *C, Type *TruncTy) {
- return getLosslessTrunc(C, TruncTy, Instruction::ZExt);
- }
-
- Constant *getLosslessSignedTrunc(Constant *C, Type *TruncTy) {
- return getLosslessTrunc(C, TruncTy, Instruction::SExt);
- }
-
std::optional<std::pair<Intrinsic::ID, SmallVector<Value *, 3>>>
convertOrOfShiftsToFunnelShift(Instruction &Or);
@@ -710,7 +693,7 @@ public:
bool foldAllocaCmp(AllocaInst *Alloca);
Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI,
GetElementPtrInst *GEP,
- GlobalVariable *GV, CmpInst &ICI,
+ CmpInst &ICI,
ConstantInt *AndCst = nullptr);
Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
Constant *RHSC);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index d7310b1c741c..a9aacc707cc2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1642,10 +1642,11 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
}
Constant *C;
+ auto &DL = IC.getDataLayout();
if (isa<Instruction>(N) && match(N, m_OneUse(m_ZExt(m_Value(X)))) &&
match(D, m_Constant(C))) {
// If the constant is the same in the smaller type, use the narrow version.
- Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType());
+ Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL);
if (!TruncC)
return nullptr;
@@ -1656,7 +1657,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
if (isa<Instruction>(D) && match(D, m_OneUse(m_ZExt(m_Value(X)))) &&
match(N, m_Constant(C))) {
// If the constant is the same in the smaller type, use the narrow version.
- Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType());
+ Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL);
if (!TruncC)
return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 6477141ab095..ed9a0be6981f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -841,7 +841,7 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
NumZexts++;
} else if (auto *C = dyn_cast<Constant>(V)) {
// Make sure that constants can fit in the new type.
- Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType);
+ Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType, DL);
if (!Trunc)
return nullptr;
NewIncoming.push_back(Trunc);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index eb4332fbc095..9467463d39c0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1993,6 +1993,63 @@ Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp,
return BinOp;
}
+/// Folds:
+///   %a_sub = call @llvm.usub.sat(x, IntConst1)
+///   %b_sub = call @llvm.usub.sat(y, IntConst2)
+///   %or = or %a_sub, %b_sub
+///   %cmp = icmp eq %or, 0
+///   %sel = select %cmp, 0, MostSignificantBit
+/// into:
+///   %a_sub' = usub.sat(x, IntConst1 - MostSignificantBit + 1)
+///   %b_sub' = usub.sat(y, IntConst2 - MostSignificantBit + 1)
+///   %or = or %a_sub', %b_sub'
+///   %and = and %or, MostSignificantBit
+/// The inverted spelling (icmp ne with the select arms swapped) is folded the
+/// same way. Splat vector constants are accepted as well.
+///
+/// Why this is correct: %sel is MostSignificantBit exactly when x > IntConst1
+/// or y > IntConst2. With Adj = IntConst - MostSignificantBit + 1 (which lies
+/// in [1, MostSignificantBit] because IntConst has its top bit set),
+/// usub.sat(x, Adj) has its top bit set exactly when x > IntConst.
+///
+/// \param SI      The select to fold; must have a single use.
+/// \param ICI     The compare feeding \p SI; must have a single use.
+/// \param Builder Used to emit the replacement usub.sat calls and the or.
+/// \returns the new (not yet inserted) 'and' instruction, or nullptr if the
+///          pattern does not match.
+static Instruction *foldICmpUSubSatWithAndForMostSignificantBitCmp(
+    SelectInst &SI, ICmpInst *ICI, InstCombiner::BuilderTy &Builder) {
+  if (!SI.hasOneUse() || !ICI->hasOneUse())
+    return nullptr;
+  CmpPredicate Pred;
+  Value *A, *B;
+  const APInt *Constant1, *Constant2;
+  if (!match(SI.getCondition(),
+             m_ICmp(Pred,
+                    m_OneUse(m_Or(m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(
+                                      m_Value(A), m_APInt(Constant1))),
+                                  m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(
+                                      m_Value(B), m_APInt(Constant2))))),
+                    m_Zero())))
+    return nullptr;
+
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+  // Accept either spelling of the select. The negation has to cover BOTH
+  // alternatives: writing it as `!(EqForm) || NeForm` rejects every NE-form
+  // select and leaves that half of the check dead.
+  const bool IsEqForm = Pred == ICmpInst::ICMP_EQ &&
+                        match(TrueVal, m_Zero()) &&
+                        match(FalseVal, m_SignMask());
+  const bool IsNeForm = Pred == ICmpInst::ICMP_NE &&
+                        match(TrueVal, m_SignMask()) &&
+                        match(FalseVal, m_Zero());
+  if (!IsEqForm && !IsNeForm)
+    return nullptr;
+
+  auto *Ty = A->getType();
+  unsigned BW = Constant1->getBitWidth();
+  APInt MostSignificantBit = APInt::getSignMask(BW);
+
+  // Both constants must be >= MostSignificantBit (i.e. negative when viewed
+  // as signed); otherwise the adjusted constants below would wrap.
+  if (Constant1->isNonNegative() || Constant2->isNonNegative())
+    return nullptr;
+
+  APInt AdjAP1 = *Constant1 - MostSignificantBit + 1;
+  APInt AdjAP2 = *Constant2 - MostSignificantBit + 1;
+
+  auto *Adj1 = ConstantInt::get(Ty, AdjAP1);
+  auto *Adj2 = ConstantInt::get(Ty, AdjAP2);
+
+  Value *NewA = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, Adj1);
+  Value *NewB = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, B, Adj2);
+  Value *Or = Builder.CreateOr(NewA, NewB);
+  Constant *MSBConst = ConstantInt::get(Ty, MostSignificantBit);
+  return BinaryOperator::CreateAnd(Or, MSBConst);
+}
+
/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
@@ -2009,6 +2066,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
if (Instruction *NewSel =
tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
return NewSel;
+ if (Instruction *Folded =
+ foldICmpUSubSatWithAndForMostSignificantBitCmp(SI, ICI, Builder))
+ return Folded;
// NOTE: if we wanted to, this is where to detect integer MIN/MAX
bool Changed = false;
@@ -2315,7 +2375,7 @@ Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) {
// If the constant is the same after truncation to the smaller type and
// extension to the original type, we can narrow the select.
Type *SelType = Sel.getType();
- Constant *TruncC = getLosslessTrunc(C, SmallType, ExtOpcode);
+ Constant *TruncC = getLosslessInvCast(C, SmallType, ExtOpcode, DL);
if (TruncC && ExtInst->hasOneUse()) {
Value *TruncCVal = cast<Value>(TruncC);
if (ExtInst == Sel.getFalseValue())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index f17fecd430a6..aa030294ff1e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -795,8 +795,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
I->dropPoisonGeneratingFlags();
return I;
}
- Known.Zero.lshrInPlace(ShiftAmt);
- Known.One.lshrInPlace(ShiftAmt);
+ Known >>= ShiftAmt;
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
} else {
@@ -1066,10 +1065,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
}
}
- Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
- RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
- Known.One = LHSKnown.One.shl(ShiftAmt) |
- RHSKnown.One.lshr(BitWidth - ShiftAmt);
+ LHSKnown <<= ShiftAmt;
+ RHSKnown >>= BitWidth - ShiftAmt;
+ Known = LHSKnown.unionWith(RHSKnown);
KnownBitsComputed = true;
break;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 5ee3bb1abe86..c2f045a2ab02 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2027,9 +2027,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
}
if (OneUse) {
- replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
- const_cast<PHINode &>(*NewPN),
- const_cast<PHINode &>(*PN), DT);
+ replaceAllDbgUsesWith(*PN, *NewPN, *PN, DT);
}
return replaceInstUsesWith(I, NewPN);
}
@@ -2570,7 +2568,7 @@ Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) {
Constant *WideC;
if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
return nullptr;
- Constant *NarrowC = getLosslessTrunc(WideC, X->getType(), CastOpc);
+ Constant *NarrowC = getLosslessInvCast(WideC, X->getType(), CastOpc, DL);
if (!NarrowC)
return nullptr;
Y = NarrowC;
@@ -2676,6 +2674,62 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP,
return nullptr;
}
+/// Merge two constant-offset GEPs that have variable-offset GEPs in between:
+///   ptradd (ptradd (ptradd p, C1), x), C2 -> ptradd (ptradd p, x), C1+C2
+///
+/// \param GEP The outer GEP; only handled when all its indices are constant.
+/// \param IC  The combiner used to rewrite operands and emit the new ptradd.
+/// \returns the result of replacing \p GEP's uses, or nullptr if no fold was
+///          performed.
+static Instruction *combineConstantOffsets(GetElementPtrInst &GEP,
+                                           InstCombinerImpl &IC) {
+  if (!GEP.hasAllConstantIndices())
+    return nullptr;
+
+  // Walk up the chain of pointer operands until another all-constant GEP is
+  // found, recording every GEP stepped over and intersecting the no-wrap
+  // flags along the way.
+  GEPNoWrapFlags Flags = GEPNoWrapFlags::all();
+  SmallVector<GetElementPtrInst *> Between;
+  auto *ConstGEP = dyn_cast<GetElementPtrInst>(GEP.getPointerOperand());
+  for (;; ConstGEP =
+              dyn_cast<GetElementPtrInst>(ConstGEP->getPointerOperand())) {
+    if (!ConstGEP)
+      return nullptr;
+
+    Flags = Flags.intersectForReassociate(ConstGEP->getNoWrapFlags());
+    if (ConstGEP->hasAllConstantIndices())
+      break;
+
+    // A GEP that gets stepped over is rewritten below, so it must not have
+    // any other user.
+    if (!ConstGEP->hasOneUse())
+      return nullptr;
+
+    Between.push_back(ConstGEP);
+  }
+
+  // Directly adjacent constant-offset GEPs are left to the normal offset
+  // merging.
+  if (Between.empty())
+    return nullptr;
+
+  // FIXME: This one-use check is not strictly necessary. Consider relaxing it
+  // if profitable.
+  if (!ConstGEP->hasOneUse())
+    return nullptr;
+
+  // Don't bother with vector splats.
+  Type *PtrTy = GEP.getType();
+  if (ConstGEP->getType() != PtrTy)
+    return nullptr;
+
+  // Sum both constant offsets into a single index-width value.
+  const DataLayout &DL = IC.getDataLayout();
+  APInt Total(DL.getIndexTypeSizeInBits(PtrTy), 0);
+  if (!GEP.accumulateConstantOffset(DL, Total))
+    return nullptr;
+  if (!ConstGEP->accumulateConstantOffset(DL, Total))
+    return nullptr;
+
+  // Bypass the inner constant GEP, then append the combined offset after the
+  // outermost stepped-over GEP.
+  IC.replaceOperand(*Between.back(), 0, ConstGEP->getPointerOperand());
+  for (GetElementPtrInst *Mid : Between)
+    Mid->setNoWrapFlags(Flags);
+
+  Value *Merged = IC.Builder.CreatePtrAdd(
+      Between.front(), IC.Builder.getInt(Total), "",
+      Flags.intersectForOffsetAdd(GEP.getNoWrapFlags()));
+  return IC.replaceInstUsesWith(GEP, Merged);
+}
+
Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
GEPOperator *Src) {
// Combine Indices - If the source pointer to this getelementptr instruction
@@ -2687,125 +2741,56 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
if (auto *I = canonicalizeGEPOfConstGEPI8(GEP, Src, *this))
return I;
- // For constant GEPs, use a more general offset-based folding approach.
- Type *PtrTy = Src->getType()->getScalarType();
- if (GEP.hasAllConstantIndices() &&
- (Src->hasOneUse() || Src->hasAllConstantIndices())) {
- // Split Src into a variable part and a constant suffix.
- gep_type_iterator GTI = gep_type_begin(*Src);
- Type *BaseType = GTI.getIndexedType();
- bool IsFirstType = true;
- unsigned NumVarIndices = 0;
- for (auto Pair : enumerate(Src->indices())) {
- if (!isa<ConstantInt>(Pair.value())) {
- BaseType = GTI.getIndexedType();
- IsFirstType = false;
- NumVarIndices = Pair.index() + 1;
- }
- ++GTI;
- }
-
- // Determine the offset for the constant suffix of Src.
- APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0);
- if (NumVarIndices != Src->getNumIndices()) {
- // FIXME: getIndexedOffsetInType() does not handled scalable vectors.
- if (BaseType->isScalableTy())
- return nullptr;
-
- SmallVector<Value *> ConstantIndices;
- if (!IsFirstType)
- ConstantIndices.push_back(
- Constant::getNullValue(Type::getInt32Ty(GEP.getContext())));
- append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices));
- Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices);
- }
-
- // Add the offset for GEP (which is fully constant).
- if (!GEP.accumulateConstantOffset(DL, Offset))
- return nullptr;
-
- // Convert the total offset back into indices.
- SmallVector<APInt> ConstIndices =
- DL.getGEPIndicesForOffset(BaseType, Offset);
- if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero()))
- return nullptr;
-
- GEPNoWrapFlags NW = getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP));
- SmallVector<Value *> Indices(
- drop_end(Src->indices(), Src->getNumIndices() - NumVarIndices));
- for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) {
- Indices.push_back(ConstantInt::get(GEP.getContext(), Idx));
- // Even if the total offset is inbounds, we may end up representing it
- // by first performing a larger negative offset, and then a smaller
- // positive one. The large negative offset might go out of bounds. Only
- // preserve inbounds if all signs are the same.
- if (Idx.isNonNegative() != ConstIndices[0].isNonNegative())
- NW = NW.withoutNoUnsignedSignedWrap();
- if (!Idx.isNonNegative())
- NW = NW.withoutNoUnsignedWrap();
- }
-
- return replaceInstUsesWith(
- GEP, Builder.CreateGEP(Src->getSourceElementType(), Src->getOperand(0),
- Indices, "", NW));
- }
+ if (auto *I = combineConstantOffsets(GEP, *this))
+ return I;
if (Src->getResultElementType() != GEP.getSourceElementType())
return nullptr;
- SmallVector<Value*, 8> Indices;
-
// Find out whether the last index in the source GEP is a sequential idx.
bool EndsWithSequential = false;
for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
I != E; ++I)
EndsWithSequential = I.isSequential();
+ if (!EndsWithSequential)
+ return nullptr;
- // Can we combine the two pointer arithmetics offsets?
- if (EndsWithSequential) {
- // Replace: gep (gep %P, long B), long A, ...
- // With: T = long A+B; gep %P, T, ...
- Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
- Value *GO1 = GEP.getOperand(1);
-
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ Value *SO1 = Src->getOperand(Src->getNumOperands() - 1);
+ Value *GO1 = GEP.getOperand(1);
- Value *Sum =
- simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
- // Only do the combine when we are sure the cost after the
- // merge is never more than that before the merge.
- if (Sum == nullptr)
- return nullptr;
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
- Indices.append(Src->op_begin()+1, Src->op_end()-1);
- Indices.push_back(Sum);
- Indices.append(GEP.op_begin()+2, GEP.op_end());
- } else if (isa<Constant>(*GEP.idx_begin()) &&
- cast<Constant>(*GEP.idx_begin())->isNullValue() &&
- Src->getNumOperands() != 1) {
- // Otherwise we can do the fold if the first index of the GEP is a zero
- Indices.append(Src->op_begin()+1, Src->op_end());
- Indices.append(GEP.idx_begin()+1, GEP.idx_end());
- }
-
- // Don't create GEPs with more than one variable index.
- unsigned NumVarIndices =
- count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); });
- if (NumVarIndices > 1)
+ Value *Sum =
+ simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
return nullptr;
- if (!Indices.empty())
- return replaceInstUsesWith(
- GEP, Builder.CreateGEP(
- Src->getSourceElementType(), Src->getOperand(0), Indices, "",
- getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
+ SmallVector<Value *, 8> Indices;
+ Indices.append(Src->op_begin() + 1, Src->op_end() - 1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin() + 2, GEP.op_end());
- return nullptr;
+ // Don't create GEPs with more than one non-zero index.
+ unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) {
+ auto *C = dyn_cast<Constant>(Idx);
+ return !C || !C->isNullValue();
+ });
+ if (NumNonZeroIndices > 1)
+ return nullptr;
+
+ return replaceInstUsesWith(
+ GEP, Builder.CreateGEP(
+ Src->getSourceElementType(), Src->getOperand(0), Indices, "",
+ getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
}
Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
@@ -3238,6 +3223,19 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
drop_end(Indices), "", GEP.getNoWrapFlags()));
}
+ // Strip leading zero indices.
+ auto *FirstIdx = dyn_cast<Constant>(Indices.front());
+ if (FirstIdx && FirstIdx->isNullValue() &&
+ !FirstIdx->getType()->isVectorTy()) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ ++GTI;
+ if (!GTI.isStruct())
+ return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(),
+ GEP.getPointerOperand(),
+ drop_begin(Indices), "",
+ GEP.getNoWrapFlags()));
+ }
+
// Scalarize vector operands; prefer splat-of-gep.as canonical form.
// Note that this looses information about undef lanes; we run it after
// demanded bits to partially mitigate that loss.
@@ -3264,17 +3262,18 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, Res);
}
- bool SeenVarIndex = false;
+ bool SeenNonZeroIndex = false;
for (auto [IdxNum, Idx] : enumerate(Indices)) {
- if (isa<Constant>(Idx))
+ auto *C = dyn_cast<Constant>(Idx);
+ if (C && C->isNullValue())
continue;
- if (!SeenVarIndex) {
- SeenVarIndex = true;
+ if (!SeenNonZeroIndex) {
+ SeenNonZeroIndex = true;
continue;
}
- // GEP has multiple variable indices: Split it.
+ // GEP has multiple non-zero indices: Split it.
ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
Value *FrontGEP =
Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
@@ -4961,63 +4960,68 @@ Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) {
Value *
InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) {
// Try to push freeze through instructions that propagate but don't produce
- // poison as far as possible. If an operand of freeze follows three
- // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
- // guaranteed-non-poison operands then push the freeze through to the one
- // operand that is not guaranteed non-poison. The actual transform is as
- // follows.
- // Op1 = ... ; Op1 can be posion
- // Op0 = Inst(Op1, NonPoisonOps...) ; Op0 has only one use and only have
- // ; single guaranteed-non-poison operands
+ // poison as far as possible. If an operand of freeze does not produce poison
+ // then push the freeze through to the operands that are not guaranteed
+ // non-poison. The actual transform is as follows.
+ // Op1 = ... ; Op1 can be poison
+ // Op0 = Inst(Op1, NonPoisonOps...)
// ... = Freeze(Op0)
// =>
// Op1 = ...
// Op1.fr = Freeze(Op1)
// ... = Inst(Op1.fr, NonPoisonOps...)
- auto *OrigOp = OrigFI.getOperand(0);
- auto *OrigOpInst = dyn_cast<Instruction>(OrigOp);
- // While we could change the other users of OrigOp to use freeze(OrigOp), that
- // potentially reduces their optimization potential, so let's only do this iff
- // the OrigOp is only used by the freeze.
- if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp))
- return nullptr;
+ auto CanPushFreeze = [](Value *V) {
+ if (!isa<Instruction>(V) || isa<PHINode>(V))
+ return false;
- // We can't push the freeze through an instruction which can itself create
- // poison. If the only source of new poison is flags, we can simply
- // strip them (since we know the only use is the freeze and nothing can
- // benefit from them.)
- if (canCreateUndefOrPoison(cast<Operator>(OrigOp),
- /*ConsiderFlagsAndMetadata*/ false))
- return nullptr;
+ // We can't push the freeze through an instruction which can itself create
+ // poison. If the only source of new poison is flags, we can simply
+ // strip them (since we know the only use is the freeze and nothing can
+ // benefit from them.)
+ return !canCreateUndefOrPoison(cast<Operator>(V),
+ /*ConsiderFlagsAndMetadata*/ false);
+ };
- // If operand is guaranteed not to be poison, there is no need to add freeze
- // to the operand. So we first find the operand that is not guaranteed to be
- // poison.
- Value *MaybePoisonOperand = nullptr;
- for (Value *V : OrigOpInst->operands()) {
- if (isa<MetadataAsValue>(V) || isGuaranteedNotToBeUndefOrPoison(V) ||
- // Treat identical operands as a single operand.
- (MaybePoisonOperand && MaybePoisonOperand == V))
+ // Pushing freezes up long instruction chains can be expensive. Instead,
+ // we directly push the freeze all the way to the leaves. However, we leave
+ // deduplication of freezes on the same value for freezeOtherUses().
+ Use *OrigUse = &OrigFI.getOperandUse(0);
+ SmallPtrSet<Instruction *, 8> Visited;
+ SmallVector<Use *, 8> Worklist;
+ Worklist.push_back(OrigUse);
+ while (!Worklist.empty()) {
+ auto *U = Worklist.pop_back_val();
+ Value *V = U->get();
+ if (!CanPushFreeze(V)) {
+ // If we can't push through the original instruction, abort the transform.
+ if (U == OrigUse)
+ return nullptr;
+
+ auto *UserI = cast<Instruction>(U->getUser());
+ Builder.SetInsertPoint(UserI);
+ Value *Frozen = Builder.CreateFreeze(V, V->getName() + ".fr");
+ U->set(Frozen);
continue;
- if (!MaybePoisonOperand)
- MaybePoisonOperand = V;
- else
- return nullptr;
- }
+ }
- OrigOpInst->dropPoisonGeneratingAnnotations();
+ auto *I = cast<Instruction>(V);
+ if (!Visited.insert(I).second)
+ continue;
- // If all operands are guaranteed to be non-poison, we can drop freeze.
- if (!MaybePoisonOperand)
- return OrigOp;
+ // reverse() to emit freezes in a more natural order.
+ for (Use &Op : reverse(I->operands())) {
+ Value *OpV = Op.get();
+ if (isa<MetadataAsValue>(OpV) || isGuaranteedNotToBeUndefOrPoison(OpV))
+ continue;
+ Worklist.push_back(&Op);
+ }
- Builder.SetInsertPoint(OrigOpInst);
- Value *FrozenMaybePoisonOperand = Builder.CreateFreeze(
- MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr");
+ I->dropPoisonGeneratingAnnotations();
+ this->Worklist.add(I);
+ }
- OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
- return OrigOp;
+ return OrigUse->get();
}
Instruction *InstCombinerImpl::foldFreezeIntoRecurrence(FreezeInst &FI,
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 50258af5e26c..42c3d4a4f4c4 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1219,7 +1219,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
// Check that size is known and can be stored in IntptrTy.
- if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size))
+ // TODO: Add support for scalable vectors if possible.
+ if (!Size || Size->isScalable() ||
+ !ConstantInt::isValueValidForType(IntptrTy, *Size))
return;
bool DoPoison = (ID == Intrinsic::lifetime_end);
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 66cdbfcf998c..832592e7663b 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -212,6 +212,15 @@ static cl::opt<float>
"OR because of the hot percentile cutoff, if "
"both are supplied."));
+static cl::opt<bool> ClStaticLinking(
+ "hwasan-static-linking",
+ cl::desc("Don't use .note.hwasan.globals section to instrument globals "
+ "from loadable libraries. "
+ "Note: in static binaries, the global variables section can be "
+ "accessed directly via linker-provided "
+ "__start_hwasan_globals and __stop_hwasan_globals symbols"),
+ cl::Hidden, cl::init(false));
+
STATISTIC(NumTotalFuncs, "Number of total funcs");
STATISTIC(NumInstrumentedFuncs, "Number of instrumented funcs");
STATISTIC(NumNoProfileSummaryFuncs, "Number of funcs without PS");
@@ -335,6 +344,7 @@ private:
FunctionAnalysisManager &FAM) const;
void initializeModule();
void createHwasanCtorComdat();
+ void createHwasanNote();
void initializeCallbacks(Module &M);
@@ -533,20 +543,7 @@ void HWAddressSanitizerPass::printPipeline(
OS << '>';
}
-void HWAddressSanitizer::createHwasanCtorComdat() {
- std::tie(HwasanCtorFunction, std::ignore) =
- getOrCreateSanitizerCtorAndInitFunctions(
- M, kHwasanModuleCtorName, kHwasanInitName,
- /*InitArgTypes=*/{},
- /*InitArgs=*/{},
- // This callback is invoked when the functions are created the first
- // time. Hook them into the global ctors list in that case:
- [&](Function *Ctor, FunctionCallee) {
- Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
- Ctor->setComdat(CtorComdat);
- appendToGlobalCtors(M, Ctor, 0, Ctor);
- });
-
+void HWAddressSanitizer::createHwasanNote() {
// Create a note that contains pointers to the list of global
// descriptors. Adding a note to the output file will cause the linker to
// create a PT_NOTE program header pointing to the note that we can use to
@@ -630,6 +627,29 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
appendToCompilerUsed(M, Dummy);
}
+/// Gets or creates the HWASan module constructor and, unless
+/// -hwasan-static-linking is set, also emits the globals note via
+/// createHwasanNote().
+void HWAddressSanitizer::createHwasanCtorComdat() {
+  // This callback is invoked when the functions are created the first time.
+  // In that case, place the ctor in its own comdat and hook it into the
+  // global ctors list.
+  auto RegisterCtor = [&](Function *Ctor, FunctionCallee) {
+    Ctor->setComdat(M.getOrInsertComdat(kHwasanModuleCtorName));
+    appendToGlobalCtors(M, Ctor, 0, Ctor);
+  };
+  HwasanCtorFunction =
+      getOrCreateSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
+                                               kHwasanInitName,
+                                               /*InitArgTypes=*/{},
+                                               /*InitArgs=*/{}, RegisterCtor)
+          .first;
+
+  // Do not create .note.hwasan.globals for static binaries, as it is only
+  // needed for instrumenting globals from dynamic libraries. In static
+  // binaries, the global variables section can be accessed directly via the
+  // __start_hwasan_globals and __stop_hwasan_globals symbols inserted by the
+  // linker.
+  if (ClStaticLinking)
+    return;
+  createHwasanNote();
+}
+
/// Module-level initialization.
///
/// inserts a call to __hwasan_init to the module's constructor list.
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a9a0731f16d9..ecb2f2dbc552 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/ProfileData/DataAccessProf.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/ProfileData/MemProfCommon.h"
@@ -75,6 +76,10 @@ static cl::opt<unsigned> MinMatchedColdBytePercent(
"memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
cl::desc("Min percent of cold bytes matched to hint allocation cold"));
+static cl::opt<bool> AnnotateStaticDataSectionPrefix(
+ "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden,
+ cl::desc("If true, annotate the static data section prefix"));
+
// Matching statistics
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
@@ -90,6 +95,14 @@ STATISTIC(NumOfMemProfMatchedAllocs,
"Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
"Number of matched memory profile callsites.");
+STATISTIC(NumOfMemProfHotGlobalVars,
+ "Number of global vars annotated with 'hot' section prefix.");
+STATISTIC(NumOfMemProfColdGlobalVars,
+ "Number of global vars annotated with 'unlikely' section prefix.");
+STATISTIC(NumOfMemProfUnknownGlobalVars,
+ "Number of global vars with unknown hotness (no section prefix).");
+STATISTIC(NumOfMemProfExplicitSectionGlobalVars,
+ "Number of global vars with user-specified section (not annotated).");
static void addCallsiteMetadata(Instruction &I,
ArrayRef<uint64_t> InlinedCallStack,
@@ -674,11 +687,12 @@ MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
}
PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
- // Return immediately if the module doesn't contain any function.
- if (M.empty())
+ // Return immediately if the module doesn't contain any function or global
+ // variables.
+ if (M.empty() && M.globals().empty())
return PreservedAnalyses::all();
- LLVM_DEBUG(dbgs() << "Read in memory profile:");
+ LLVM_DEBUG(dbgs() << "Read in memory profile:\n");
auto &Ctx = M.getContext();
auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
if (Error E = ReaderOrErr.takeError()) {
@@ -703,6 +717,14 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::all();
}
+ const bool Changed =
+ annotateGlobalVariables(M, MemProfReader->getDataAccessProfileData());
+
+ // If the module doesn't contain any function, return after we process all
+ // global variables.
+ if (M.empty())
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
@@ -752,3 +774,95 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::none();
}
+
+// Returns true iff the global variable has a custom section (hasSection(), or a
+// bss-/data-/relro-/rodata-section attribute), requested by either __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+ if (GVar.hasSection())
+ return true;
+
+ auto Attrs = GVar.getAttributes();
+ if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+ Attrs.hasAttribute("relro-section") ||
+ Attrs.hasAttribute("rodata-section"))
+ return true;
+ return false;
+}
+
+bool MemProfUsePass::annotateGlobalVariables(
+ Module &M, const memprof::DataAccessProfData *DataAccessProf) {
+ if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
+ return false;
+
+ if (!DataAccessProf) {
+ M.getContext().diagnose(DiagnosticInfoPGOProfile(
+ MemoryProfileFileName.data(),
+ StringRef("Data access profiles not found in memprof. Ignore "
+ "-memprof-annotate-static-data-prefix."),
+ DS_Warning));
+ return false;
+ }
+
+ bool Changed = false;
+ // Iterate over all global variables in the module and annotate them based on
+ // data access profiles; Changed (the return value) records whether any prefix
+ // was set. Note it's up to the linker to decide how to map input sections to
+ // output sections, and one conservative practice is to map unlikely-prefixed
+ // ones to an unlikely output section, and the rest (hot-prefixed or prefix-less) to the canonical output section.
+ for (GlobalVariable &GVar : M.globals()) {
+ assert(!GVar.getSectionPrefix().has_value() &&
+ "GVar shouldn't have section prefix yet");
+ if (GVar.isDeclarationForLinker())
+ continue;
+
+ if (hasExplicitSectionName(GVar)) {
+ ++NumOfMemProfExplicitSectionGlobalVars;
+ LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
+ << " has explicit section name. Skip annotating.\n");
+ continue;
+ }
+
+ StringRef Name = GVar.getName();
+ // Skip string literals (names starting with .str) as their mangled names
+ // don't stay stable across binary releases.
+ // TODO: Track string content hash in the profiles and compute it inside the
+ // compiler to categorize the hotness of string literals.
+ if (Name.starts_with(".str")) {
+
+ LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
+ continue;
+ }
+
+ // DataAccessProfRecord's get* methods will canonicalize the name under the
+ // hood before looking it up, so the optimizer doesn't need to do it.
+ std::optional<DataAccessProfRecord> Record =
+ DataAccessProf->getProfileRecord(Name);
+ // Annotate a global variable as hot if it has non-zero sampled count, and
+ // annotate it as cold (the unlikely prefix) if it's seen in the profiled binary
+ // file but doesn't have any access sample. Anything else keeps no prefix and is only counted.
+ // For logging, optimization remark emitter requires a llvm::Function, but
+ // it's not well defined how to associate a global variable with a function.
+ // So we just print out the static data section prefix in LLVM_DEBUG.
+ if (Record && Record->AccessCount > 0) {
+ ++NumOfMemProfHotGlobalVars;
+ GVar.setSectionPrefix("hot");
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Global variable " << Name
+ << " is annotated as hot\n");
+ } else if (DataAccessProf->isKnownColdSymbol(Name)) {
+ ++NumOfMemProfColdGlobalVars;
+ GVar.setSectionPrefix("unlikely");
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Global variable " << Name
+ << " is annotated as unlikely\n");
+ } else {
+ ++NumOfMemProfUnknownGlobalVars;
+ LLVM_DEBUG(dbgs() << "Global variable " << Name << " is not annotated\n");
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 27292d1a66c3..9899a2aae2b1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3263,7 +3263,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return true;
}
- /// Heuristically instrument unknown intrinsics.
+ /// Returns whether it was able to heuristically instrument unknown
+ /// intrinsics.
///
/// The main purpose of this code is to do something reasonable with all
/// random intrinsics we might encounter, most importantly - SIMD intrinsics.
@@ -3273,7 +3274,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
///
/// We special-case intrinsics where this approach fails. See llvm.bswap
/// handling as an example of that.
- bool handleUnknownIntrinsicUnlogged(IntrinsicInst &I) {
+ bool maybeHandleUnknownIntrinsicUnlogged(IntrinsicInst &I) {
unsigned NumArgOperands = I.arg_size();
if (NumArgOperands == 0)
return false;
@@ -3300,8 +3301,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return false;
}
- bool handleUnknownIntrinsic(IntrinsicInst &I) {
- if (handleUnknownIntrinsicUnlogged(I)) {
+ bool maybeHandleUnknownIntrinsic(IntrinsicInst &I) {
+ if (maybeHandleUnknownIntrinsicUnlogged(I)) {
if (ClDumpHeuristicInstructions)
dumpInst(I);
@@ -3860,7 +3861,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Three operands:
// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
- // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+ // (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b)
// (this is equivalent to multiply-add on %a and %b, followed by
// adding/"accumulating" %s. "Accumulation" stores the result in one
// of the source registers, but this accumulate vs. add distinction
@@ -3902,15 +3903,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getPrimitiveSizeInBits());
if (I.arg_size() == 3) {
- assert(ParamType == ReturnType);
- assert(ParamType == I.getArgOperand(0)->getType());
+ [[maybe_unused]] auto *AccumulatorType =
+ cast<FixedVectorType>(I.getOperand(0)->getType());
+ assert(AccumulatorType == ReturnType);
}
FixedVectorType *ImplicitReturnType = ReturnType;
// Step 1: instrument multiplication of corresponding vector elements
if (EltSizeInBits) {
- ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
- EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+ ImplicitReturnType = cast<FixedVectorType>(
+ getMMXVectorTy(EltSizeInBits * ReductionFactor,
+ ParamType->getPrimitiveSizeInBits()));
ParamType = cast<FixedVectorType>(
getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
@@ -3958,7 +3961,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Step 2: instrument horizontal add
// We don't need bit-precise horizontalReduce because we only want to check
- // if each pair of elements is fully zero.
+ // if each pair/quad of elements is fully zero.
// Cast to <4 x i32>.
Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
@@ -3968,7 +3971,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Constant::getNullValue(Horizontal->getType())),
ImplicitReturnType);
- // Cast it back to the required fake return type (<1 x i64>).
+ // Cast it back to the required fake return type (if MMX: <1 x i64>; for
+ // AVX, it is already correct).
if (EltSizeInBits)
OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
@@ -5262,7 +5266,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleShadowOr(I);
}
- void visitIntrinsicInst(IntrinsicInst &I) {
+ bool maybeHandleCrossPlatformIntrinsic(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
@@ -5342,6 +5346,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleVectorReduceWithStarterIntrinsic(I);
break;
+ case Intrinsic::scmp:
+ case Intrinsic::ucmp: {
+ handleShadowOr(I);
+ break;
+ }
+
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ handleFunnelShift(I);
+ break;
+
+ case Intrinsic::is_constant:
+ // The result of llvm.is.constant() is always defined.
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ break;
+
+ default:
+ return false;
+ }
+
+ return true;
+ }
+
+ bool maybeHandleX86SIMDIntrinsic(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
case Intrinsic::x86_sse_stmxcsr:
handleStmxcsr(I);
break;
@@ -5392,6 +5422,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // Convert Packed Single Precision Floating-Point Values
+ // to Packed Signed Doubleword Integer Values
+ //
+ // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
+ // (<16 x float>, <16 x i32>, i16, i32)
+ case Intrinsic::x86_avx512_mask_cvtps2dq_512:
+ handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false);
+ break;
+
// Convert Packed Double Precision Floating-Point Values
// to Packed Single Precision Floating-Point Values
case Intrinsic::x86_sse2_cvtpd2ps:
@@ -5492,23 +5531,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_mmx_psrli_q:
case Intrinsic::x86_mmx_psrai_w:
case Intrinsic::x86_mmx_psrai_d:
- case Intrinsic::aarch64_neon_rshrn:
- case Intrinsic::aarch64_neon_sqrshl:
- case Intrinsic::aarch64_neon_sqrshrn:
- case Intrinsic::aarch64_neon_sqrshrun:
- case Intrinsic::aarch64_neon_sqshl:
- case Intrinsic::aarch64_neon_sqshlu:
- case Intrinsic::aarch64_neon_sqshrn:
- case Intrinsic::aarch64_neon_sqshrun:
- case Intrinsic::aarch64_neon_srshl:
- case Intrinsic::aarch64_neon_sshl:
- case Intrinsic::aarch64_neon_uqrshl:
- case Intrinsic::aarch64_neon_uqrshrn:
- case Intrinsic::aarch64_neon_uqshl:
- case Intrinsic::aarch64_neon_uqshrn:
- case Intrinsic::aarch64_neon_urshl:
- case Intrinsic::aarch64_neon_ushl:
- // Not handled here: aarch64_neon_vsli (vector shift left and insert)
handleVectorShiftIntrinsic(I, /* Variable */ false);
break;
case Intrinsic::x86_avx2_psllv_d:
@@ -5621,19 +5643,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// Multiply and Add Packed Signed and Unsigned Bytes
// < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, <16 x i8>, <16 x i8>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <32 x i8>, <32 x i8>)
// <16 x i32> @llvm.x86.avx512.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <64 x i8>, <64 x i8>)
//
// Multiply and Add Unsigned and Signed Bytes With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, <16 x i8>, <16 x i8>)
// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <32 x i8>, <32 x i8>)
// <16 x i32> @llvm.x86.avx512.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <64 x i8>, <64 x i8>)
//
// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
// (< 4 x i32>, < 4 x i32>, < 4 x i32>)
@@ -5652,30 +5674,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <16 x i8>, <16 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <32 x i8>, <32 x i8>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <64 x i8>, <64 x i8>, i16)
case Intrinsic::x86_avx512_vpdpbusd_128:
case Intrinsic::x86_avx512_vpdpbusd_256:
case Intrinsic::x86_avx512_vpdpbusd_512:
@@ -5930,7 +5952,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512_max_pd_512: {
// These AVX512 variants contain the rounding mode as a trailing flag.
// Earlier variants do not have a trailing flag and are already handled
- // by maybeHandleSimpleNomemIntrinsic(I, 0) via handleUnknownIntrinsic.
+ // by maybeHandleSimpleNomemIntrinsic(I, 0) via
+ // maybeHandleUnknownIntrinsic.
[[maybe_unused]] bool Success =
maybeHandleSimpleNomemIntrinsic(I, /*trailingFlags=*/1);
assert(Success);
@@ -5988,15 +6011,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/*trailingVerbatimArgs=*/1);
break;
- // Convert Packed Single Precision Floating-Point Values
- // to Packed Signed Doubleword Integer Values
- //
- // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
- // (<16 x float>, <16 x i32>, i16, i32)
- case Intrinsic::x86_avx512_mask_cvtps2dq_512:
- handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false);
- break;
-
// AVX512 PMOV: Packed MOV, with truncation
// Precisely handled by applying the same intrinsic to the shadow
case Intrinsic::x86_avx512_mask_pmov_dw_512:
@@ -6074,15 +6088,33 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleAVXGF2P8Affine(I);
break;
- case Intrinsic::fshl:
- case Intrinsic::fshr:
- handleFunnelShift(I);
- break;
+ default:
+ return false;
+ }
- case Intrinsic::is_constant:
- // The result of llvm.is.constant() is always defined.
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
+ return true;
+ }
+
+ bool maybeHandleArmSIMDIntrinsic(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
+ case Intrinsic::aarch64_neon_rshrn:
+ case Intrinsic::aarch64_neon_sqrshl:
+ case Intrinsic::aarch64_neon_sqrshrn:
+ case Intrinsic::aarch64_neon_sqrshrun:
+ case Intrinsic::aarch64_neon_sqshl:
+ case Intrinsic::aarch64_neon_sqshlu:
+ case Intrinsic::aarch64_neon_sqshrn:
+ case Intrinsic::aarch64_neon_sqshrun:
+ case Intrinsic::aarch64_neon_srshl:
+ case Intrinsic::aarch64_neon_sshl:
+ case Intrinsic::aarch64_neon_uqrshl:
+ case Intrinsic::aarch64_neon_uqrshrn:
+ case Intrinsic::aarch64_neon_uqshl:
+ case Intrinsic::aarch64_neon_uqshrn:
+ case Intrinsic::aarch64_neon_urshl:
+ case Intrinsic::aarch64_neon_ushl:
+ // Not handled here: aarch64_neon_vsli (vector shift left and insert)
+ handleVectorShiftIntrinsic(I, /* Variable */ false);
break;
// TODO: handling max/min similarly to AND/OR may be more precise
@@ -6233,17 +6265,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
- case Intrinsic::scmp:
- case Intrinsic::ucmp: {
- handleShadowOr(I);
- break;
- }
-
default:
- if (!handleUnknownIntrinsic(I))
- visitInstruction(I);
- break;
+ return false;
}
+
+ return true;
+ }
+
+ void visitIntrinsicInst(IntrinsicInst &I) { // Dispatch I to the first handler group that accepts it.
+ if (maybeHandleCrossPlatformIntrinsic(I)) // Target-independent intrinsics (e.g. scmp/ucmp, fshl/fshr, is_constant).
+ return;
+
+ if (maybeHandleX86SIMDIntrinsic(I)) // x86_* intrinsics.
+ return;
+
+ if (maybeHandleArmSIMDIntrinsic(I)) // aarch64_neon_* intrinsics.
+ return;
+
+ if (maybeHandleUnknownIntrinsic(I)) // Heuristic handling for intrinsics no switch above claimed.
+ return;
+
+ visitInstruction(I); // Nothing handled it: fall back to the generic instruction visitor.
+ }
void visitLibAtomicLoad(CallBase &CB) {
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 1ddb8ae9518f..4acc3f2d8469 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -19,9 +19,11 @@
#include "llvm/Analysis/ConstraintSystem.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -170,10 +172,12 @@ struct State {
DominatorTree &DT;
LoopInfo &LI;
ScalarEvolution &SE;
+ TargetLibraryInfo &TLI;
SmallVector<FactOrCheck, 64> WorkList;
- State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE)
- : DT(DT), LI(LI), SE(SE) {}
+ State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE,
+ TargetLibraryInfo &TLI)
+ : DT(DT), LI(LI), SE(SE), TLI(TLI) {}
/// Process block \p BB and add known facts to work-list.
void addInfoFor(BasicBlock &BB);
@@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) {
}
}
+static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP,
+ uint64_t AccessSize,
+ CmpPredicate &Pred, Value *&A,
+ Value *&B, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL);
+ if (!Offset.NW.hasNoUnsignedWrap())
+ return false;
+
+ if (Offset.VariableOffsets.size() != 1)
+ return false;
+
+ uint64_t BitWidth = Offset.ConstantOffset.getBitWidth();
+ auto &[Index, Scale] = Offset.VariableOffsets.front();
+ // Bail out on non-canonical GEPs, i.e. when the index is not as wide as the constant offset.
+ if (Index->getType()->getScalarSizeInBits() != BitWidth)
+ return false;
+
+ ObjectSizeOpts Opts;
+ // Workaround for gep inbounds, ptr null, idx: request that a null base has unknown size.
+ Opts.NullIsUnknownSize = true;
+ // Be conservative since we are not clear on whether an out of bounds access
+ // to the padding is UB or not, so request the size rounded to the alignment.
+ Opts.RoundToAlign = true;
+ std::optional<TypeSize> Size =
+ getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts);
+ if (!Size || Size->isScalable())
+ return false;
+
+ // Bound being encoded: Index * Scale + ConstOffset + AccessSize <= AllocSize.
+ // On success the result is reported as Index u<= MaxIndex through Pred/A/B and true is returned.
+ // With nuw flag (checked above), we know that the index addition doesn't have unsigned wrap. If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid
+ // value for Index.
+ APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize,
+ /*isSigned=*/false, /*implicitTrunc=*/true) -
+ Offset.ConstantOffset)
+ .udiv(Scale);
+ Pred = ICmpInst::ICMP_ULE;
+ A = Index;
+ B = ConstantInt::get(Index->getType(), MaxIndex);
+ return true;
+}
+
void State::addInfoFor(BasicBlock &BB) {
addInfoForInductions(BB);
+ auto &DL = BB.getDataLayout();
- // True as long as long as the current instruction is guaranteed to execute.
+ // True as long as the current instruction is guaranteed to execute.
bool GuaranteedToExecute = true;
// Queue conditions and assumes.
for (Instruction &I : BB) {
@@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) {
continue;
}
+ auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return;
+ TypeSize AccessSize = DL.getTypeStoreSize(AccessType);
+ if (!AccessSize.isFixed())
+ return;
+ if (GuaranteedToExecute) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(),
+ Pred, A, B, DL, TLI)) {
+ // The memory access is guaranteed to execute when BB is entered,
+ // hence the constraint holds on entry to BB.
+ WorkList.emplace_back(FactOrCheck::getConditionFact(
+ DT.getNode(I.getParent()), Pred, A, B));
+ }
+ } else {
+ WorkList.emplace_back(
+ FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
+ }
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isVolatile())
+ AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType());
+ }
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isVolatile())
+ AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType());
+ }
+
auto *II = dyn_cast<IntrinsicInst>(&I);
Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
switch (ID) {
@@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A,
LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n");
auto R = Info.getConstraintForSolving(Pred, A, B);
- if (R.empty() || !R.isValid(Info)){
+ if (R.empty() || !R.isValid(Info)) {
LLVM_DEBUG(dbgs() << " failed to decompose condition\n");
return std::nullopt;
}
@@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
ScalarEvolution &SE,
- OptimizationRemarkEmitter &ORE) {
+ OptimizationRemarkEmitter &ORE,
+ TargetLibraryInfo &TLI) {
bool Changed = false;
DT.updateDFSNumbers();
SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args()));
ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
- State S(DT, LI, SE);
+ State S(DT, LI, SE, TLI);
std::unique_ptr<Module> ReproducerModule(
DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
@@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
}
continue;
}
+
+ auto &DL = F.getDataLayout();
+ auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(
+ *cast<GetElementPtrInst>(Ptr),
+ DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL,
+ TLI))
+ AddFact(Pred, A, B);
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) {
+ AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType());
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) {
+ AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType());
+ continue;
+ }
}
Value *A = nullptr, *B = nullptr;
@@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F,
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- if (!eliminateConstraints(F, DT, LI, SE, ORE))
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 434b55868c99..944b253e0f5e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -521,7 +521,7 @@ private:
Instruction *SIUse = dyn_cast<Instruction>(SI->user_back());
// The use of the select inst should be either a phi or another select.
- if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
+ if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
return false;
BasicBlock *SIBB = SI->getParent();
@@ -581,15 +581,17 @@ struct AllSwitchPaths {
VisitedBlocks VB;
// Get paths from the determinator BBs to SwitchPhiDefBB
std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB);
- if (SwitchPhiDefBB == SwitchBlock) {
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+ if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
TPaths = std::move(PathsToPhiDef);
return;
}
+ assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
+ auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
// Find and append paths from SwitchPhiDefBB to SwitchBlock.
PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1);
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
if (PathsToSwitchBB.empty())
return;
@@ -610,13 +612,16 @@ private:
typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef,
PHINode *Phi,
- VisitedBlocks &VB) {
+ VisitedBlocks &VB,
+ unsigned PathsLimit) {
std::vector<ThreadingPath> Res;
auto *PhiBB = Phi->getParent();
VB.insert(PhiBB);
VisitedBlocks UniqueBlocks;
for (auto *IncomingBB : Phi->blocks()) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!UniqueBlocks.insert(IncomingBB).second)
continue;
if (!SwitchOuterLoop->contains(IncomingBB))
@@ -652,8 +657,9 @@ private:
// Direct predecessor, just add to the path.
if (IncomingPhiDefBB == IncomingBB) {
- std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ assert(PathsLimit > Res.size());
+ std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap(
+ StateDef, IncomingPhi, VB, PathsLimit - Res.size());
for (ThreadingPath &Path : PredPaths) {
Path.push_back(PhiBB);
Res.push_back(std::move(Path));
@@ -666,13 +672,17 @@ private:
continue;
PathsType IntermediatePaths;
- IntermediatePaths =
- paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1);
+ assert(PathsLimit > Res.size());
+ auto InterPathLimit = PathsLimit - Res.size();
+ IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB,
+ /* PathDepth = */ 1, InterPathLimit);
if (IntermediatePaths.empty())
continue;
+ assert(InterPathLimit >= IntermediatePaths.size());
+ auto PredPathLimit = InterPathLimit / IntermediatePaths.size();
std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit);
for (const ThreadingPath &Path : PredPaths) {
for (const PathType &IPath : IntermediatePaths) {
ThreadingPath NewPath(Path);
@@ -687,7 +697,7 @@ private:
}
PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited,
- unsigned PathDepth) {
+ unsigned PathDepth, unsigned PathsLimit) {
PathsType Res;
// Stop exploring paths after visiting MaxPathLength blocks
@@ -714,6 +724,8 @@ private:
// is used to prevent a duplicate path from being generated
SmallPtrSet<BasicBlock *, 4> Successors;
for (BasicBlock *Succ : successors(BB)) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!Successors.insert(Succ).second)
continue;
@@ -735,14 +747,12 @@ private:
// coverage and compile time.
if (LI->getLoopFor(Succ) != CurrLoop)
continue;
-
- PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1);
+ assert(PathsLimit > Res.size());
+ PathsType SuccPaths =
+ paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size());
for (PathType &Path : SuccPaths) {
Path.push_front(BB);
Res.push_back(Path);
- if (Res.size() >= MaxNumPaths) {
- return Res;
- }
}
}
// This block could now be visited again from a different predecessor. Note
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 4baa3b3eb824..26e17cc849bf 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
bool GVNPass::performScalarPRE(Instruction *CurInst) {
if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects())
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ CurInst->getType()->isTokenLikeTy())
return false;
// Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index e9bf59c6850a..b60b15b6c3a2 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -35,8 +36,38 @@ static bool tryToImproveAlign(
return true;
}
}
- // TODO: Also handle memory intrinsics.
- return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ // TODO: Handle more memory intrinsics.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store: {
+ int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2;
+ Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load
+ ? II->getArgOperand(0)
+ : II->getArgOperand(1);
+ Type *Type = II->getIntrinsicID() == Intrinsic::masked_load
+ ? II->getType()
+ : II->getArgOperand(0)->getType();
+
+ Align OldAlign =
+ cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue();
+ Align PrefAlign = DL.getPrefTypeAlign(Type);
+ Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
+ if (NewAlign <= OldAlign)
+ return false;
+
+ Value *V =
+ ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value());
+ II->setOperand(AlignOpIdx, V);
+ return true;
+ }
+ default:
+ return false;
+ }
}
bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index c2a737d8f9a4..c7d71eb5633e 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
- if (PredV->getType() != LoadI->getType())
+ if (PredV->getType() != LoadI->getType()) {
PredV = CastInst::CreateBitOrPointerCast(
PredV, LoadI->getType(), "", P->getTerminator()->getIterator());
+ // The new cast is producing the value used to replace the load
+ // instruction, so it uses the load's debug location. If P does not always
+ // branch to the load BB however then the debug location must be dropped,
+ // as it is hoisted past a conditional branch.
+ DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1
+ ? LoadI->getDebugLoc()
+ : DebugLoc::getDropped();
+ cast<CastInst>(PredV)->setDebugLoc(DL);
+ }
PN->addIncoming(PredV, I->first);
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 03b92d3338a9..0874b29ab7d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -39,6 +39,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/HashRecognize.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -143,6 +144,14 @@ static cl::opt<bool, true>
cl::location(DisableLIRP::Wcslen), cl::init(false),
cl::ReallyHidden);
+bool DisableLIRP::HashRecognize;
+static cl::opt<bool, true>
+ DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
+ cl::desc("Proceed with loop idiom recognize pass, "
+ "but do not optimize CRC loops."),
+ cl::location(DisableLIRP::HashRecognize),
+ cl::init(false), cl::ReallyHidden);
+
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
cl::desc("Use loop idiom recognition code size heuristics when compiling "
@@ -242,6 +251,7 @@ private:
const SCEV *BECount);
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
bool IsLoopMemset = false);
+ bool optimizeCRCLoop(const PolynomialInfo &Info);
/// @}
/// \name Noncountable Loop Idiom Handling
@@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+ std::optional<PolynomialInfo> HR;
+
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
AR.MSSA, DL, ORE);
if (!LIR.runOnLoop(&L))
@@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
- if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
+ if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
+ HasMemcpy || !DisableLIRP::HashRecognize)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
@@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
}
+
+ // Optimize a CRC loop if HashRecognize found one, provided we're not
+ // optimizing for size.
+ if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
+ if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
+ optimizeCRCLoop(*Res);
+
return MadeChange;
}
@@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
return false;
}
+bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
+ // Rewrite the recognized CRC loop into a Sarwate table-lookup loop.
+ // FIXME: Hexagon's HexagonLoopIdiom optimizes CRC using carry-less
+ // multiplication instructions, which is more efficient than our Sarwate
+ // table-lookup optimization. Until we can emit target-specific instructions
+ // for Hexagon, subsuming HexagonLoopIdiom, bail out on Hexagon.
+ Module &M = *CurLoop->getHeader()->getModule();
+ Triple TT(M.getTargetTriple());
+ if (TT.getArch() == Triple::hexagon)
+ return false;
+
+ // First, create a private, constant GlobalVariable ".crctable" holding the
+ // 256-entry Sarwate lookup table.
+ Type *CRCTy = Info.LHS->getType();
+ unsigned CRCBW = CRCTy->getIntegerBitWidth();
+ std::array<Constant *, 256> CRCConstants;
+ transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
+ CRCConstants.begin(),
+ [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
+ Constant *ConstArray =
+ ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
+ GlobalVariable *GV =
+ new GlobalVariable(M, ConstArray->getType(), true,
+ GlobalValue::PrivateLinkage, ConstArray, ".crctable");
+
+ PHINode *IV = CurLoop->getCanonicalInductionVariable();
+ SmallVector<PHINode *, 2> Cleanup;
+
+ // Next, RAUW every header PHI except IV with poison, and queue it for removal.
+ {
+ for (PHINode &PN : CurLoop->getHeader()->phis()) {
+ if (&PN == IV)
+ continue;
+ PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+ Cleanup.push_back(&PN);
+ }
+ }
+
+ // Next, fix up the trip count: the new exit limit is TripCount / 8 - 1.
+ {
+ unsigned NewBTC = (Info.TripCount / 8) - 1;
+ BasicBlock *LoopBlk = CurLoop->getLoopLatch();
+ BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
+ CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
+ ? ICmpInst::Predicate::ICMP_NE
+ : ICmpInst::Predicate::ICMP_EQ;
+ Instruction *ExitCond = CurLoop->getLatchCmpInst();
+ Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
+ IRBuilder<> Builder(ExitCond);
+ Value *NewExitCond =
+ Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
+ ExitCond->replaceAllUsesWith(NewExitCond);
+ deleteDeadInstruction(ExitCond);
+ }
+
+ // Finally, fill the loop with the Sarwate-table-lookup logic, and replace the
+ // uses of ComputedValue outside the loop latch.
+ //
+ // Little-endian:
+ // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
+ // Big-endian:
+ // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
+ {
+ auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
+ Type *OpTy = Op->getType();
+ unsigned OpBW = OpTy->getIntegerBitWidth();
+ return OpBW > 8
+ ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name)
+ : Op;
+ };
+ auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
+ const Twine &Name) {
+ Type *OpTy = Op->getType();
+
+ // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
+ // use the CRC's bitwidth as the reference for shifting right.
+ return LoByte(Builder,
+ CRCBW > 8 ? Builder.CreateLShr(
+ Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
+ : Op,
+ Name + ".lo.byte");
+ };
+
+ IRBuilder<> Builder(CurLoop->getHeader(),
+ CurLoop->getHeader()->getFirstNonPHIIt());
+
+ // Create the CRC PHI, and initialize its incoming value to the initial
+ // value of CRC.
+ PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
+ CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
+
+ // CRC is now an evolving variable, initialized to the PHI.
+ Value *CRC = CRCPhi;
+
+ // Indexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
+ // of LHSAux), if LHSAux is non-nullptr.
+ Value *Indexer = CRC;
+ if (Value *Data = Info.LHSAux) {
+ Type *DataTy = Data->getType();
+
+ // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
+ // shift right by that amount, and take the lo-byte (in the little-endian
+ // case), or shift left by that amount, and take the hi-idx (in the
+ // big-endian case).
+ Value *IVBits = Builder.CreateZExtOrTrunc(
+ Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
+ Value *DataIndexer =
+ Info.ByteOrderSwapped
+ ? Builder.CreateShl(Data, IVBits, "data.indexer")
+ : Builder.CreateLShr(Data, IVBits, "data.indexer");
+ Indexer = Builder.CreateXor(
+ DataIndexer,
+ Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
+ "crc.data.indexer");
+ }
+
+ Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
+ : LoByte(Builder, Indexer, "indexer.lo");
+
+ // Always index into a GEP using the index type.
+ Indexer = Builder.CreateZExt(
+ Indexer, SE->getDataLayout().getIndexType(GV->getType()),
+ "indexer.ext");
+
+ // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
+ Value *CRCTableGEP =
+ Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
+ Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
+
+ // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
+ // CRC-8.
+ Value *CRCNext = CRCTableLd;
+ if (CRCBW > 8) {
+ Value *CRCShift = Info.ByteOrderSwapped
+ ? Builder.CreateShl(CRC, 8, "crc.be.shift")
+ : Builder.CreateLShr(CRC, 8, "crc.le.shift");
+ CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
+ }
+
+ // Connect the back-edge, and replace ComputedValue's uses outside the latch.
+ CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
+ Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
+ CurLoop->getLoopLatch());
+ }
+
+ // Cleanup: delete the dead PHIs and drop SCEV's cached info for the loop.
+ {
+ for (PHINode *PN : Cleanup)
+ RecursivelyDeleteDeadPHINode(PN);
+ SE->forgetLoop(CurLoop);
+ }
+ return true;
+}
+
bool LoopIdiomRecognize::runOnNoncountableLoop() {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName()
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f7d2258e1c28..2bda9d83236e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
UP.RuntimeUnrollMultiExit = false;
+ UP.AddAdditionalAccumulators = false;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ULO.Heart = getLoopConvergenceHeart(L);
ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
+ ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
LoopUnrollResult UnrollResult = UnrollLoop(
L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8b9d06d7e443..8a5569743ab4 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -247,8 +247,8 @@ private:
/// index I' according to UserChain produced by function "find".
///
/// The building conceptually takes two steps:
- /// 1) iteratively distribute s/zext towards the leaves of the expression tree
- /// that computes I
+ /// 1) iteratively distribute sext/zext/trunc towards the leaves of the
+ /// expression tree that computes I
/// 2) reassociate the expression tree to the form I' + C.
///
/// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
@@ -260,29 +260,30 @@ private:
Value *rebuildWithoutConstOffset();
/// After the first step of rebuilding the GEP index without the constant
- /// offset, distribute s/zext to the operands of all operators in UserChain.
- /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
+ /// offset, distribute sext/zext/trunc to the operands of all operators in
+ /// UserChain. e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
/// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
///
/// The function also updates UserChain to point to new subexpressions after
- /// distributing s/zext. e.g., the old UserChain of the above example is
- /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// distributing sext/zext/trunc. e.g., the old UserChain of the above example
+ /// is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
/// and the new UserChain is
- /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
///
/// \p ChainIndex The index to UserChain. ChainIndex is initially
/// UserChain.size() - 1, and is decremented during
/// the recursion.
- Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+ Value *distributeCastsAndCloneChain(unsigned ChainIndex);
/// Reassociates the GEP index to the form I' + C and returns I'.
Value *removeConstOffset(unsigned ChainIndex);
- /// A helper function to apply ExtInsts, a list of s/zext, to value V.
- /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value
+ /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function
/// returns "sext i32 (zext i16 V to i32) to i64".
- Value *applyExts(Value *V);
+ Value *applyCasts(Value *V);
/// A helper function that returns whether we can trace into the operands
/// of binary operator BO for a constant offset.
@@ -307,8 +308,8 @@ private:
SmallVector<User *, 8> UserChain;
/// A data structure used in rebuildWithoutConstOffset. Contains all
- /// sext/zext instructions along UserChain.
- SmallVector<CastInst *, 16> ExtInsts;
+ /// sext/zext/trunc instructions along UserChain.
+ SmallVector<CastInst *, 16> CastInsts;
/// Insertion position of cloned instructions.
BasicBlock::iterator IP;
@@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
}
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
- // Do not trace into "or" unless it is equivalent to "add".
+ // Do not trace into "or" unless it is equivalent to "add nuw nsw".
// This is the case if the or's disjoint flag is set.
if (BO->getOpcode() == Instruction::Or &&
!cast<PossiblyDisjointInst>(BO)->isDisjoint())
@@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
return false;
- // In addition, tracing into BO requires that its surrounding s/zext (if
- // any) is distributable to both operands.
+ // In addition, tracing into BO requires that its surrounding sext/zext/trunc
+ // (if any) is distributable to both operands.
//
// Suppose BO = A op B.
// SignExtended | ZeroExtended | Distributable?
@@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
return ConstantOffset;
}
-Value *ConstantOffsetExtractor::applyExts(Value *V) {
+Value *ConstantOffsetExtractor::applyCasts(Value *V) {
Value *Current = V;
- // ExtInsts is built in the use-def order. Therefore, we apply them to V
+ // CastInsts is built in the use-def order. Therefore, we apply them to V
// in the reversed order.
- for (CastInst *I : llvm::reverse(ExtInsts)) {
+ for (CastInst *I : llvm::reverse(CastInsts)) {
if (Constant *C = dyn_cast<Constant>(Current)) {
// Try to constant fold the cast.
Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL);
@@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
continue;
}
- Instruction *Ext = I->clone();
- Ext->setOperand(0, Current);
+ Instruction *Cast = I->clone();
+ Cast->setOperand(0, Current);
// In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
// we assume that it is ok to redistribute trunc over add/sub/or. But for
// example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
// nuw (add A, B))). To make such redistributions legal we drop all the
// poison generating flags from cloned trunc instructions here.
- if (isa<TruncInst>(Ext))
- Ext->dropPoisonGeneratingFlags();
- Ext->insertBefore(*IP->getParent(), IP);
- Current = Ext;
+ if (isa<TruncInst>(Cast))
+ Cast->dropPoisonGeneratingFlags();
+ Cast->insertBefore(*IP->getParent(), IP);
+ Current = Cast;
}
return Current;
}
Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
- distributeExtsAndCloneChain(UserChain.size() - 1);
- // Remove all nullptrs (used to be s/zext) from UserChain.
+ distributeCastsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (used to be sext/zext/trunc) from UserChain.
unsigned NewSize = 0;
for (User *I : UserChain) {
if (I != nullptr) {
@@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
}
Value *
-ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) {
User *U = UserChain[ChainIndex];
if (ChainIndex == 0) {
assert(isa<ConstantInt>(U));
- // If U is a ConstantInt, applyExts will return a ConstantInt as well.
- return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ // If U is a ConstantInt, applyCasts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U));
}
if (CastInst *Cast = dyn_cast<CastInst>(U)) {
assert(
(isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
"Only following instructions can be traced: sext, zext & trunc");
- ExtInsts.push_back(Cast);
+ CastInsts.push_back(Cast);
UserChain[ChainIndex] = nullptr;
- return distributeExtsAndCloneChain(ChainIndex - 1);
+ return distributeCastsAndCloneChain(ChainIndex - 1);
}
// Function find only trace into BinaryOperator and CastInst.
BinaryOperator *BO = cast<BinaryOperator>(U);
// OpNo = which operand of BO is UserChain[ChainIndex - 1]
unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
- Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+ Value *TheOther = applyCasts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1);
BinaryOperator *NewBO = nullptr;
if (OpNo == 0) {
@@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
assert((BO->use_empty() || BO->hasOneUse()) &&
- "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "distributeCastsAndCloneChain clones each BinaryOperator in "
"UserChain, so no one should be used more than "
"once");
@@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) {
// "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)"
if (const TruncInst *TI = dyn_cast<TruncInst>(U))
return TI->hasNoUnsignedWrap();
- return isa<CastInst>(U) || isa<ConstantInt>(U);
+ assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User.");
+ return true;
}
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc03da6b..e4ba70d1bce1 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
static cl::opt<int> UnswitchSiblingsToplevelDiv(
"unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchParentBlocksDiv(
+ "unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
+ cl::desc("Outer loop size divisor for cost multiplier."));
static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
"unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
return 1;
}
+ // Each invariant non-trivial condition, after being unswitched, is supposed
+ // to have its own specialized sibling loop (the invariant condition has been
+ // hoisted out of the child loop into a newly-cloned loop). When unswitching
+ // conditions in nested loops, the basic block size of the outer loop should
+ // not be altered. If such a size significantly increases across unswitching
+ // invocations, something may be wrong; so adjust the final cost taking this
+ // into account.
auto *ParentL = L.getParentLoop();
+ int ParentLoopSizeMultiplier = 1;
+ if (ParentL)
+ ParentLoopSizeMultiplier =
+ std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
+
int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
: std::distance(LI.begin(), LI.end()));
// Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
// at an upper bound.
int CostMultiplier;
if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
+ SiblingsMultiplier > UnswitchThreshold ||
+ ParentLoopSizeMultiplier > UnswitchThreshold)
CostMultiplier = UnswitchThreshold;
else
CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
(int)UnswitchThreshold);
LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
+ << " (siblings " << SiblingsMultiplier << " * parent size "
+ << ParentLoopSizeMultiplier << " * clones "
<< (1 << ClonesPower) << ")"
<< " for unswitch candidate: " << TI << "\n");
return CostMultiplier;
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index bb7dbc2980f5..e05625344ee2 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() {
continue;
OtherPhi->setIncomingValue(PoisonValBBIdx, V);
- Phi->setIncomingValue(i, OtherV);
+ if (DT->dominates(OtherV, Phi))
+ Phi->setIncomingValue(i, OtherV);
}
}
}
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index c76b3afef50c..27b13eeaf4d7 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -1285,7 +1285,7 @@ private:
// Cache misses on the merged chain
double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
double MergedSize = ChainPred->Size + ChainSucc->Size;
- double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
+ double MergedDensity = MergedCounts / MergedSize;
double NewScore = MergedCounts * missProbability(MergedDensity);
return CurScore - NewScore;
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 7063cde5263b..5a09b7385f2b 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -254,7 +254,6 @@ bool llvm::applyDebugifyMetadata(
}
if (ApplyToMF)
ApplyToMF(DIB, F);
- DIB.finalizeSubprogram(SP);
}
DIB.finalize();
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 3bbe875bbe9e..1a9e16be6989 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -13,6 +13,8 @@
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TimeProfiler.h"
+
using namespace llvm;
/// Uses the "source_filename" instead of a Module hash ID for the suffix of
@@ -370,6 +372,7 @@ void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); }
void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
bool ClearDSOLocalOnDeclarations,
SetVector<GlobalValue *> *GlobalsToImport) {
+ llvm::TimeTraceScope timeScope("Rename module for ThinLTO");
FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
ClearDSOLocalOnDeclarations);
ThinLTOProcessing.run();
diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp
index ad91318ae474..fefa49f68c8d 100644
--- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp
+++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp
@@ -427,7 +427,7 @@ void IRNormalizer::reorderInstructions(Function &F) const {
// Process the remaining instructions.
//
// TODO: Do more a intelligent sorting of these instructions. For example,
- // seperate between dead instructinos and instructions used in another
+ // separate between dead instructions and instructions used in another
// block. Use properties of the CFG the order instructions that are used
// in another block.
if (Visited.contains(&I))
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ac344904f90f..2cfd70a1746c 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3397,8 +3397,8 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) {
const APFloat &APF = FP->getValueAPF();
APInt const &API = APF.bitcastToAPInt();
- if (auto Temp = API.getZExtValue())
- return DIB.createConstantValueExpression(static_cast<uint64_t>(Temp));
+ if (uint64_t Temp = API.getZExtValue())
+ return DIB.createConstantValueExpression(Temp);
return DIB.createConstantValueExpression(*API.getRawData());
}
@@ -3838,8 +3838,8 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
const auto *Op = I->getOperand(OpIdx);
- // We can't have a PHI with a metadata type.
- if (Op->getType()->isMetadataTy())
+ // We can't have a PHI with a metadata or token type.
+ if (Op->getType()->isMetadataTy() || Op->getType()->isTokenLikeTy())
return false;
// swifterror pointers can only be used by a load, store, or as a swifterror
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index ba0ac01cadd8..735bad1cb134 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -225,9 +225,9 @@ protected:
// Auxiliary function to calculate the number of iterations for a comparison
// instruction or a binary operator.
- PeelCounter mergeTwoCounter(const Instruction &CmpOrBinaryOp,
- const PeelCounterValue &LHS,
- const PeelCounterValue &RHS) const;
+ PeelCounter mergeTwoCounters(const Instruction &CmpOrBinaryOp,
+ const PeelCounterValue &LHS,
+ const PeelCounterValue &RHS) const;
// Returns true if the \p Phi is an induction in the target loop. This is a
// lightweight check and possible to detect an IV in some cases.
@@ -269,15 +269,13 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
break;
// Avoid infinite loop.
- if (Visited.contains(Cur))
+ if (!Visited.insert(Cur).second)
return false;
auto *I = dyn_cast<Instruction>(Cur);
if (!I || !L.contains(I))
return false;
- Visited.insert(Cur);
-
if (auto *Cast = dyn_cast<CastInst>(I)) {
Cur = Cast->getOperand(0);
} else if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
@@ -300,14 +298,14 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
/// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is
/// considered an IV only if it is an addition or a subtraction. Otherwise the
-/// result can be a value that is neither an loop-invariant nor an IV.
+/// result can be a value that is neither a loop-invariant nor an IV.
///
/// If both \p LHS and \p RHS are loop-invariants, then the result of
/// \CmpOrBinaryOp is also a loop-invariant.
PhiAnalyzer::PeelCounter
-PhiAnalyzer::mergeTwoCounter(const Instruction &CmpOrBinaryOp,
- const PeelCounterValue &LHS,
- const PeelCounterValue &RHS) const {
+PhiAnalyzer::mergeTwoCounters(const Instruction &CmpOrBinaryOp,
+ const PeelCounterValue &LHS,
+ const PeelCounterValue &RHS) const {
auto &[LVal, LTy] = LHS;
auto &[RVal, RTy] = RHS;
unsigned NewVal = std::max(LVal, RVal);
@@ -380,7 +378,7 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
if (RHS == Unknown)
return Unknown;
return (IterationsToInvarianceOrInduction[I] =
- mergeTwoCounter(*I, *LHS, *RHS));
+ mergeTwoCounters(*I, *LHS, *RHS));
}
if (I->isCast())
// Cast instructions get the value of the operand.
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 86b268de43cf..b18aceaa67d7 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
#endif
);
+static cl::opt<bool> UnrollAddParallelReductions(
+ "unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
+ cl::desc("Allow unrolling to add parallel reduction phis."));
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
@@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
OrigPHINode.push_back(cast<PHINode>(I));
}
+ // Collect phi nodes for reductions for which we can introduce multiple
+ // parallel reduction phis and compute the final reduction result after the
+ // loop. This requires a single exit block after unrolling. This is ensured by
+ // restricting to single-block loops where the unrolled iterations are known
+ // to not exit.
+ DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
+ bool CanAddAdditionalAccumulators =
+ (UnrollAddParallelReductions.getNumOccurrences() > 0
+ ? UnrollAddParallelReductions
+ : ULO.AddAdditionalAccumulators) &&
+ !CompletelyUnroll && L->getNumBlocks() == 1 &&
+ (ULO.Runtime ||
+ (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
+ ExitInfos[Header].BreakoutTrip == 0))));
+
+ // Limit parallelizing reductions to unroll counts of 4 or less for now.
+ // TODO: The number of parallel reductions should depend on the number of
+ // execution units. We also don't have to add a parallel reduction phi per
+ // unrolled iteration, but could for example add a parallel phi for every 2
+ // unrolled iterations.
+ if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
+ for (PHINode &Phi : Header->phis()) {
+ auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
+ if (!RdxDesc)
+ continue;
+
+ // Only handle duplicate phis for a single reduction for now.
+ // TODO: Handle any number of reductions
+ if (!Reductions.empty())
+ continue;
+
+ Reductions[&Phi] = *RdxDesc;
+ }
+ }
+
std::vector<BasicBlock *> Headers;
std::vector<BasicBlock *> Latches;
Headers.push_back(Header);
@@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// latch. This is a reasonable default placement if we don't have block
// frequencies, and if we do, well the layout will be adjusted later.
auto BlockInsertPt = std::next(LatchBlock->getIterator());
+ SmallVector<Instruction *> PartialReductions;
for (unsigned It = 1; It != ULO.Count; ++It) {
SmallVector<BasicBlock *, 8> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (PHINode *OrigPHI : OrigPHINode) {
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+
+ // Use cloned phis as parallel phis for partial reductions, which will
+ // get combined to the final reduction result after the loop.
+ if (Reductions.contains(OrigPHI)) {
+ // Collect partial reduction results.
+ if (PartialReductions.empty())
+ PartialReductions.push_back(cast<Instruction>(InVal));
+ PartialReductions.push_back(cast<Instruction>(VMap[InVal]));
+
+ // Update the start value for the cloned phis to use the identity
+ // value for the reduction.
+ const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
+ NewPHI->setIncomingValueForBlock(
+ L->getLoopPreheader(),
+ getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
+ OrigPHI->getType(),
+ RdxDesc.getFastMathFlags()));
+
+ // Update NewPHI to use the cloned value for the iteration and move
+ // to header.
+ NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
+ NewPHI->moveBefore(OrigPHI->getIterator());
+ continue;
+ }
+
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
if (It > 1 && L->contains(InValI))
InVal = LastValueMap[InValI];
@@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
PN->eraseFromParent();
} else if (ULO.Count > 1) {
+ if (Reductions.contains(PN))
+ continue;
+
Value *InVal = PN->removeIncomingValue(LatchBlock, false);
// If this value was defined in the loop, take the value defined by the
// last iteration of the loop.
@@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
}
+ // If there are partial reductions, create code in the exit block to compute
+ // the final result and update users of the final result.
+ if (!PartialReductions.empty()) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock &&
+ "Can only introduce parallel reduction phis with single exit block");
+ assert(Reductions.size() == 1 &&
+ "currently only a single reduction is supported");
+ Value *FinalRdxValue = PartialReductions.back();
+ Value *RdxResult = nullptr;
+ for (PHINode &Phi : ExitBlock->phis()) {
+ if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
+ continue;
+ if (!RdxResult) {
+ RdxResult = PartialReductions.front();
+ IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
+ RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
+ for (Instruction *RdxPart : drop_begin(PartialReductions)) {
+ RdxResult = Builder.CreateBinOp(
+ (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
+ RdxPart, RdxResult, "bin.rdx");
+ }
+ NeedToFixLCSSA = true;
+ for (Instruction *RdxPart : PartialReductions)
+ RdxPart->dropPoisonGeneratingFlags();
+ }
+
+ Phi.replaceAllUsesWith(RdxResult);
+ continue;
+ }
+ }
+
if (DTUToUse) {
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
@@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
}
return nullptr;
}
+
+/// Decide whether the reduction rooted at \p Phi in loop \p L may be split
+/// into parallel partial reductions while unrolling.
+///
+/// \param Phi Header phi that is a candidate reduction phi.
+/// \param L   Loop containing \p Phi.
+/// \param SE  ScalarEvolution instance forwarded to the reduction analysis.
+/// \returns the recurrence descriptor of \p Phi if it is a supported
+/// reduction, std::nullopt otherwise.
+std::optional<RecurrenceDescriptor>
+llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
+                                           ScalarEvolution *SE) {
+  RecurrenceDescriptor RdxDesc;
+  if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc,
+                                            /*DemandedBits=*/nullptr,
+                                            /*AC=*/nullptr, /*DT=*/nullptr, SE))
+    return std::nullopt;
+  RecurKind RK = RdxDesc.getRecurrenceKind();
+  // Skip unsupported reductions.
+  // TODO: Handle additional reductions, including FP and min-max
+  // reductions.
+  if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+    return std::nullopt;
+
+  if (RdxDesc.IntermediateStore)
+    return std::nullopt;
+
+  // The latch is used as the incoming block for the checks below, so make
+  // sure it exists before querying the phi with it.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return std::nullopt;
+
+  // The value flowing into the phi along the backedge, i.e. the reduction
+  // update performed in each iteration.
+  auto *RdxUpdate = cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+
+  // Don't unroll reductions with constant ops; those can be folded to a
+  // single induction update.
+  if (any_of(RdxUpdate->operands(), IsaPred<Constant>))
+    return std::nullopt;
+
+  // The update must use the phi directly as one of its operands.
+  if (!is_contained(RdxUpdate->operands(), &Phi))
+    return std::nullopt;
+
+  return RdxDesc;
+}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 843364eb34f8..b172ef6ba080 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2032,6 +2032,7 @@ Value *llvm::addRuntimeChecks(
MemoryRuntimeCheck = IsConflict;
}
+ Exp.eraseDeadInstructions(MemoryRuntimeCheck);
return MemoryRuntimeCheck;
}
@@ -2077,6 +2078,7 @@ Value *llvm::addDiffRuntimeChecks(
MemoryRuntimeCheck = IsConflict;
}
+ Expander.eraseDeadInstructions(MemoryRuntimeCheck);
return MemoryRuntimeCheck;
}
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 1711163fb9f5..ec2e6c1ab796 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -81,6 +81,8 @@ void LoopVersioning::versionLoop(
} else
RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
+ Exp.eraseDeadInstructions(SCEVRuntimeCheck);
+
assert(RuntimeCheck && "called even though we don't need "
"any runtime checks");
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
index 41647f7717a4..faacd422c009 100644
--- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -155,12 +155,15 @@ PreservedAnalyses ProfileVerifierPass::run(Function &F,
FunctionAnalysisManager &FAM) {
const auto EntryCount = F.getEntryCount(/*AllowSynthetic=*/true);
if (!EntryCount) {
- F.getContext().emitError("Profile verification failed: function entry "
- "count missing (set to 0 if cold)");
+ auto *MD = F.getMetadata(LLVMContext::MD_prof);
+ if (!MD || !isExplicitlyUnknownProfileMetadata(*MD)) {
+ F.getContext().emitError("Profile verification failed: function entry "
+ "count missing (set to 0 if cold)");
+ return PreservedAnalyses::all();
+ }
+ } else if (EntryCount->getCount() == 0) {
return PreservedAnalyses::all();
}
- if (EntryCount->getCount() == 0)
- return PreservedAnalyses::all();
for (const auto &BB : F) {
if (AnnotateSelect) {
for (const auto &I : BB)
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 10c162bc6463..d93a4d87f30f 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -849,9 +849,12 @@ void PromoteMem2Reg::run() {
for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
IncomingVals.init(i, UndefValue::get(Allocas[i]->getAllocatedType()));
- // When handling debug info, treat all incoming values as if they have unknown
- // locations until proven otherwise.
+ // When handling debug info, treat all incoming values as if they have
+ // compiler-generated (empty) locations, representing the uninitialized
+ // alloca, until proven otherwise.
IncomingLocs.resize(Allocas.size());
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+ IncomingLocs.init(i, DebugLoc::getCompilerGenerated());
// The renamer uses the Visited set to avoid infinite loops.
Visited.resize(F.getMaxBlockNumber(), false);
diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
index d53a3144bf57..a814867652cd 100644
--- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
+++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
@@ -21,29 +21,20 @@
using namespace llvm;
-static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
+/// Information about a lookup table gathered while deciding whether it can
+/// be converted to a relative lookup table, and reused by the conversion.
+struct LookupTableInfo {
+  /// The single variable index of the GEP that addresses the table. Null
+  /// until the analysis has identified it.
+  Value *Index = nullptr;
+  /// The pointer constants read from the table initializer, one per entry,
+  /// in table order.
+  SmallVector<Constant *> Ptrs;
+};
+
+static bool shouldConvertToRelLookupTable(LookupTableInfo &Info, Module &M,
+ GlobalVariable &GV) {
// If lookup table has more than one user,
// do not generate a relative lookup table.
// This is to simplify the analysis that needs to be done for this pass.
// TODO: Add support for lookup tables with multiple uses.
// For ex, this can happen when a function that uses a lookup table gets
// inlined into multiple call sites.
- if (!GV.hasInitializer() ||
- !GV.isConstant() ||
- !GV.hasOneUse())
- return false;
-
- GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser());
- if (!GEP || !GEP->hasOneUse() ||
- GV.getValueType() != GEP->getSourceElementType())
- return false;
-
- LoadInst *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser());
- if (!Load || !Load->hasOneUse() ||
- Load->getType() != GEP->getResultElementType())
- return false;
-
+ //
// If the original lookup table does not have local linkage and is
// not dso_local, do not generate a relative lookup table.
// This optimization creates a relative lookup table that consists of
@@ -51,21 +42,40 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
// To be able to generate these offsets, relative lookup table and
// its elements should have internal linkage and be dso_local, which means
// that they should resolve to symbols within the same linkage unit.
- if (!GV.hasLocalLinkage() ||
- !GV.isDSOLocal() ||
- !GV.isImplicitDSOLocal())
+ if (!GV.hasInitializer() || !GV.isConstant() || !GV.hasOneUse() ||
+ !GV.hasLocalLinkage() || !GV.isDSOLocal() || !GV.isImplicitDSOLocal())
return false;
- ConstantArray *Array = dyn_cast<ConstantArray>(GV.getInitializer());
- if (!Array)
+ auto *GEP = dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser());
+ if (!GEP || !GEP->hasOneUse())
+ return false;
+
+ auto *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser());
+ if (!Load || !Load->hasOneUse())
return false;
// If values are not 64-bit pointers, do not generate a relative lookup table.
const DataLayout &DL = M.getDataLayout();
- Type *ElemType = Array->getType()->getElementType();
+ Type *ElemType = Load->getType();
if (!ElemType->isPointerTy() || DL.getPointerTypeSizeInBits(ElemType) != 64)
return false;
+ // Make sure this is a gep of the form GV + scale*var.
+ unsigned IndexWidth =
+ DL.getIndexTypeSizeInBits(Load->getPointerOperand()->getType());
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ APInt ConstOffset(IndexWidth, 0);
+ if (!GEP->collectOffset(DL, IndexWidth, VarOffsets, ConstOffset) ||
+ !ConstOffset.isZero() || VarOffsets.size() != 1)
+ return false;
+
+ // This can't be a pointer lookup table if the stride is smaller than a
+ // pointer.
+ Info.Index = VarOffsets.front().first;
+ const APInt &Stride = VarOffsets.front().second;
+ if (Stride.ult(DL.getTypeStoreSize(ElemType)))
+ return false;
+
SmallVector<GlobalVariable *, 4> GVOps;
Triple TT = M.getTargetTriple();
// FIXME: This should be removed in the future.
@@ -80,14 +90,20 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
// https://github.com/rust-lang/rust/issues/141306.
|| (TT.isX86() && TT.isOSDarwin());
- for (const Use &Op : Array->operands()) {
- Constant *ConstOp = cast<Constant>(&Op);
+ APInt Offset(IndexWidth, 0);
+ uint64_t GVSize = DL.getTypeAllocSize(GV.getValueType());
+ for (; Offset.ult(GVSize); Offset += Stride) {
+ Constant *C =
+ ConstantFoldLoadFromConst(GV.getInitializer(), ElemType, Offset, DL);
+ if (!C)
+ return false;
+
GlobalValue *GVOp;
- APInt Offset;
+ APInt GVOffset;
// If an operand is not a constant offset from a lookup table,
// do not generate a relative lookup table.
- if (!IsConstantOffsetFromGlobal(ConstOp, GVOp, Offset, DL))
+ if (!IsConstantOffsetFromGlobal(C, GVOp, GVOffset, DL))
return false;
// If operand is mutable, do not generate a relative lookup table.
@@ -102,6 +118,8 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
if (ShouldDropUnnamedAddr)
GVOps.push_back(GlovalVarOp);
+
+ Info.Ptrs.push_back(C);
}
if (ShouldDropUnnamedAddr)
@@ -111,14 +129,12 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
return true;
}
-static GlobalVariable *createRelLookupTable(Function &Func,
+static GlobalVariable *createRelLookupTable(LookupTableInfo &Info,
+ Function &Func,
GlobalVariable &LookupTable) {
Module &M = *Func.getParent();
- ConstantArray *LookupTableArr =
- cast<ConstantArray>(LookupTable.getInitializer());
- unsigned NumElts = LookupTableArr->getType()->getNumElements();
ArrayType *IntArrayTy =
- ArrayType::get(Type::getInt32Ty(M.getContext()), NumElts);
+ ArrayType::get(Type::getInt32Ty(M.getContext()), Info.Ptrs.size());
GlobalVariable *RelLookupTable = new GlobalVariable(
M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(),
@@ -127,10 +143,9 @@ static GlobalVariable *createRelLookupTable(Function &Func,
LookupTable.isExternallyInitialized());
uint64_t Idx = 0;
- SmallVector<Constant *, 64> RelLookupTableContents(NumElts);
+ SmallVector<Constant *, 64> RelLookupTableContents(Info.Ptrs.size());
- for (Use &Operand : LookupTableArr->operands()) {
- Constant *Element = cast<Constant>(Operand);
+ for (Constant *Element : Info.Ptrs) {
Type *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
Constant *Base = llvm::ConstantExpr::getPtrToInt(RelLookupTable, IntPtrTy);
Constant *Target = llvm::ConstantExpr::getPtrToInt(Element, IntPtrTy);
@@ -148,7 +163,8 @@ static GlobalVariable *createRelLookupTable(Function &Func,
return RelLookupTable;
}
-static void convertToRelLookupTable(GlobalVariable &LookupTable) {
+static void convertToRelLookupTable(LookupTableInfo &Info,
+ GlobalVariable &LookupTable) {
GetElementPtrInst *GEP =
cast<GetElementPtrInst>(LookupTable.use_begin()->getUser());
LoadInst *Load = cast<LoadInst>(GEP->use_begin()->getUser());
@@ -159,21 +175,21 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) {
Function &Func = *BB->getParent();
// Generate an array that consists of relative offsets.
- GlobalVariable *RelLookupTable = createRelLookupTable(Func, LookupTable);
+ GlobalVariable *RelLookupTable =
+ createRelLookupTable(Info, Func, LookupTable);
// Place new instruction sequence before GEP.
Builder.SetInsertPoint(GEP);
- Value *Index = GEP->getOperand(2);
- IntegerType *IntTy = cast<IntegerType>(Index->getType());
- Value *Offset =
- Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift");
+ IntegerType *IntTy = cast<IntegerType>(Info.Index->getType());
+ Value *Offset = Builder.CreateShl(Info.Index, ConstantInt::get(IntTy, 2),
+ "reltable.shift");
// Insert the call to load.relative intrinsic before LOAD.
// GEP might not be immediately followed by a LOAD, like it can be hoisted
// outside the loop or another instruction might be inserted them in between.
Builder.SetInsertPoint(Load);
Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration(
- &M, Intrinsic::load_relative, {Index->getType()});
+ &M, Intrinsic::load_relative, {Info.Index->getType()});
// Create a call to load.relative intrinsic that computes the target address
// by adding base address (lookup table address) and relative offset.
@@ -205,10 +221,11 @@ static bool convertToRelativeLookupTables(
bool Changed = false;
for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
- if (!shouldConvertToRelLookupTable(M, GV))
+ LookupTableInfo Info;
+ if (!shouldConvertToRelLookupTable(Info, M, GV))
continue;
- convertToRelLookupTable(GV);
+ convertToRelLookupTable(Info, GV);
// Remove the original lookup table.
GV.eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 060ca92e559a..28befd0aa1ce 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
@@ -175,6 +176,26 @@ SCEVExpander::findInsertPointAfter(Instruction *I,
return IP;
}
+/// Erase instructions inserted by this expander that are trivially dead.
+///
+/// Starts from every inserted instruction and follows the operands of each
+/// erased instruction, so values that only become dead after their user is
+/// removed are erased as well. \p Root is never erased.
+void SCEVExpander::eraseDeadInstructions(Value *Root) {
+  SmallPtrSet<Value *, 8> Erased;
+  SmallVector<Value *> Pending;
+  append_range(Pending, getAllInsertedInstructions());
+
+  while (!Pending.empty()) {
+    Value *Candidate = Pending.pop_back_val();
+    // Already-erased values must be filtered out before they are inspected.
+    if (Erased.contains(Candidate))
+      continue;
+
+    auto *Inst = dyn_cast<Instruction>(Candidate);
+    if (!Inst || Inst == Root)
+      continue;
+    // Only touch instructions this expander created, and only if dead.
+    if (!isInsertedInstruction(Inst) || !isInstructionTriviallyDead(Inst))
+      continue;
+
+    // Queue the operands first: they may become dead once Inst is gone.
+    append_range(Pending, Inst->operands());
+    InsertedValues.erase(Inst);
+    InsertedPostIncValues.erase(Inst);
+    Erased.insert(Inst);
+    Inst->eraseFromParent();
+  }
+}
+
BasicBlock::iterator
SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const {
// Cast the argument at the beginning of the entry block, after
@@ -1239,10 +1260,13 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
if (!isa<SCEVAddRecExpr>(ExitSCEV))
continue;
Type *PhiTy = PN.getType();
- if (STy->isIntegerTy() && PhiTy->isPointerTy())
+ if (STy->isIntegerTy() && PhiTy->isPointerTy()) {
ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
- else if (S->getType() != PN.getType())
+ if (isa<SCEVCouldNotCompute>(ExitSCEV))
+ continue;
+ } else if (S->getType() != PN.getType()) {
continue;
+ }
// Check if we can re-use the existing PN, by adjusting it with an expanded
// offset, if the offset is simpler.
@@ -2184,8 +2208,15 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
// negative. If Step is known to be positive or negative, only create
// either 1. or 2.
auto ComputeEndCheck = [&]() -> Value * {
- // Checking <u 0 is always false.
- if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+ // Checking <u 0 is always false, if (Step * trunc ExitCount) does not wrap.
+ // TODO: Predicates that can be proven true/false should be discarded when
+ // the predicates are created, not late during expansion.
+ if (!Signed && Start->isZero() && SE.isKnownPositive(Step) &&
+ DstBits < SrcBits &&
+ ExitCount == SE.getZeroExtendExpr(SE.getTruncateExpr(ExitCount, ARTy),
+ ExitCount->getType()) &&
+ SE.willNotOverflow(Instruction::Mul, Signed, Step,
+ SE.getTruncateExpr(ExitCount, ARTy)))
return ConstantInt::getFalse(Loc->getContext());
// Get the backedge taken count and truncate or extended to the AR type.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7a538ae2c583..970f85378d3d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -612,6 +612,18 @@ private:
/// If CompValue is already set, the function is expected to fail if a match
/// is found but the value compared to is different.
bool matchInstruction(Instruction *I, bool isEQ) {
+ if (match(I, m_Not(m_Instruction(I))))
+ isEQ = !isEQ;
+
+ Value *Val;
+ if (match(I, m_NUWTrunc(m_Value(Val)))) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(Val))
+ return false;
+ UsedICmps++;
+ Vals.push_back(ConstantInt::get(cast<IntegerType>(Val->getType()), isEQ));
+ return true;
+ }
// If this is an icmp against a constant, handle this as one of the cases.
ICmpInst *ICI;
ConstantInt *C;
@@ -2260,10 +2272,6 @@ static bool canSinkInstructions(
for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
Value *Op = I0->getOperand(OI);
- if (Op->getType()->isTokenTy())
- // Don't touch any operand of token type.
- return false;
-
auto SameAsI0 = [&I0, OI](const Instruction *I) {
assert(I->getNumOperands() == I0->getNumOperands());
return I->getOperand(OI) == I0->getOperand(OI);
@@ -2764,8 +2772,7 @@ bool CompatibleSets::shouldBelongToSameSet(ArrayRef<InvokeInst *> Invokes) {
Use &U1 = std::get<1>(Ops);
if (U0 == U1)
return false;
- return U0->getType()->isTokenTy() ||
- !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()),
+ return !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()),
U0.getOperandNo());
};
assert(Invokes.size() == 2 && "Always called with exactly two candidates.");
@@ -4404,10 +4411,12 @@ static bool mergeConditionalStoreToAddress(
// OK, we're going to sink the stores to PostBB. The store has to be
// conditional though, so first create the predicate.
- Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
- Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
+ BranchInst *PBranch =
+ cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator());
+ BranchInst *QBranch =
+ cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator());
+ Value *PCond = PBranch->getCondition();
+ Value *QCond = QBranch->getCondition();
Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
PStore->getParent());
@@ -4418,13 +4427,11 @@ static bool mergeConditionalStoreToAddress(
IRBuilder<> QB(PostBB, PostBBFirst);
QB.SetCurrentDebugLocation(PostBBFirst->getStableDebugLoc());
- Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
- Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
+ InvertPCond ^= (PStore->getParent() != PTB);
+ InvertQCond ^= (QStore->getParent() != QTB);
+ Value *PPred = InvertPCond ? QB.CreateNot(PCond) : PCond;
+ Value *QPred = InvertQCond ? QB.CreateNot(QCond) : QCond;
- if (InvertPCond)
- PPred = QB.CreateNot(PPred);
- if (InvertQCond)
- QPred = QB.CreateNot(QPred);
Value *CombinedPred = QB.CreateOr(PPred, QPred);
BasicBlock::iterator InsertPt = QB.GetInsertPoint();
@@ -4808,23 +4815,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
SelectInst *NV = cast<SelectInst>(
Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
PN.setIncomingValue(PBBIdx, NV);
- // Although the select has the same condition as PBI, the original branch
- // weights for PBI do not apply to the new select because the select's
- // 'logical' edges are incoming edges of the phi that is eliminated, not
- // the outgoing edges of PBI.
+ // The select has the same condition as PBI, in the same BB. The
+ // probabilities don't change.
if (HasWeights) {
- uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
- uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
- // The weight to PredCommonDest should be PredCommon * SuccTotal.
- // The weight to PredOtherDest should be PredOther * SuccCommon.
- uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
- PredOther * SuccCommon};
-
- fitWeights(NewWeights);
-
- setBranchWeights(NV, NewWeights[0], NewWeights[1],
+ uint64_t TrueWeight = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t FalseWeight = PBIOp ? PredTrueWeight : PredFalseWeight;
+ setBranchWeights(NV, TrueWeight, FalseWeight,
/*IsExpected=*/false);
}
}
@@ -6437,34 +6433,42 @@ static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
namespace {
-/// This class represents a lookup table that can be used to replace a switch.
-class SwitchLookupTable {
+/// This class finds alternatives for switches to ultimately
+/// replace the switch.
+class SwitchReplacement {
public:
- /// Create a lookup table to use as a switch replacement with the contents
- /// of Values, using DefaultValue to fill any holes in the table.
- SwitchLookupTable(
+ /// Create a helper for optimizations to use as a switch replacement.
+ /// Find a better representation for the content of Values,
+ /// using DefaultValue to fill any holes in the table.
+ SwitchReplacement(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
- /// Build instructions with Builder to retrieve the value at
- /// the position given by Index in the lookup table.
- Value *buildLookup(Value *Index, IRBuilder<> &Builder, const DataLayout &DL);
+ /// Build instructions with Builder to retrieve values using Index
+ /// and replace the switch.
+ Value *replaceSwitch(Value *Index, IRBuilder<> &Builder, const DataLayout &DL,
+ Function *Func);
/// Return true if a table with TableSize elements of
/// type ElementType would fit in a target-legal register.
static bool wouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
Type *ElementType);
+ /// Return the default value of the switch.
+ Constant *getDefaultValue();
+
+ /// Return true if the replacement is a lookup table.
+ bool isLookupTable();
+
private:
- // Depending on the contents of the table, it can be represented in
- // different ways.
+ // Depending on the switch, there are different alternatives.
enum {
- // For tables where each element contains the same value, we just have to
+ // For switches where each case contains the same value, we just have to
// store that single value and return it for each lookup.
SingleValueKind,
- // For tables where there is a linear relationship between table index
+ // For switches where there is a linear relationship between table index
// and values. We calculate the result with a simple multiplication
// and addition instead of a table lookup.
LinearMapKind,
@@ -6476,9 +6480,15 @@ private:
// The table is stored as an array of values. Values are retrieved by load
// instructions from the table.
- ArrayKind
+ LookupTableKind
} Kind;
+ // The default value of the switch.
+ Constant *DefaultValue;
+
+ // The type of the output values.
+ Type *ValueType;
+
// For SingleValueKind, this is the single value.
Constant *SingleValue = nullptr;
@@ -6491,23 +6501,24 @@ private:
ConstantInt *LinearMultiplier = nullptr;
bool LinearMapValWrapped = false;
- // For ArrayKind, this is the array.
- GlobalVariable *Array = nullptr;
+ // For LookupTableKind, this is the table.
+ Constant *Initializer = nullptr;
};
} // end anonymous namespace
-SwitchLookupTable::SwitchLookupTable(
+SwitchReplacement::SwitchReplacement(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName)
+ : DefaultValue(DefaultValue) {
assert(Values.size() && "Can't build lookup table without values!");
assert(TableSize >= Values.size() && "Can't fit values in table!");
// If all values in the table are equal, this is that value.
SingleValue = Values.begin()->second;
- Type *ValueType = Values.begin()->second->getType();
+ ValueType = Values.begin()->second->getType();
// Build up the table contents.
SmallVector<Constant *, 64> TableContents(TableSize);
@@ -6597,7 +6608,6 @@ SwitchLookupTable::SwitchLookupTable(
(void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap);
LinearMapValWrapped = NonMonotonic || MayWrap;
Kind = LinearMapKind;
- ++NumLinearMaps;
return;
}
}
@@ -6617,30 +6627,23 @@ SwitchLookupTable::SwitchLookupTable(
BitMap = ConstantInt::get(M.getContext(), TableInt);
BitMapElementTy = IT;
Kind = BitMapKind;
- ++NumBitMaps;
return;
}
// Store the table in an array.
- ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
- Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
-
- Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
- GlobalVariable::PrivateLinkage, Initializer,
- "switch.table." + FuncName);
- Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- // Set the alignment to that of an array items. We will be only loading one
- // value out of it.
- Array->setAlignment(DL.getPrefTypeAlign(ValueType));
- Kind = ArrayKind;
+ auto *TableTy = ArrayType::get(ValueType, TableSize);
+ Initializer = ConstantArray::get(TableTy, TableContents);
+
+ Kind = LookupTableKind;
}
-Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
- const DataLayout &DL) {
+Value *SwitchReplacement::replaceSwitch(Value *Index, IRBuilder<> &Builder,
+ const DataLayout &DL, Function *Func) {
switch (Kind) {
case SingleValueKind:
return SingleValue;
case LinearMapKind: {
+ ++NumLinearMaps;
// Derive the result value from the input value.
Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
false, "switch.idx.cast");
@@ -6656,6 +6659,7 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
return Result;
}
case BitMapKind: {
+ ++NumBitMaps;
// Type of the bitmap (e.g. i59).
IntegerType *MapTy = BitMap->getIntegerType();
@@ -6677,9 +6681,18 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
// Mask off.
return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
}
- case ArrayKind: {
- Type *IndexTy = DL.getIndexType(Array->getType());
- auto *ArrayTy = cast<ArrayType>(Array->getValueType());
+ case LookupTableKind: {
+ ++NumLookupTables;
+ auto *Table =
+ new GlobalVariable(*Func->getParent(), Initializer->getType(),
+ /*isConstant=*/true, GlobalVariable::PrivateLinkage,
+ Initializer, "switch.table." + Func->getName());
+ Table->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ // Set the alignment to that of an array item. We will be only loading one
+ // value out of it.
+ Table->setAlignment(DL.getPrefTypeAlign(ValueType));
+ Type *IndexTy = DL.getIndexType(Table->getType());
+ auto *ArrayTy = cast<ArrayType>(Table->getValueType());
if (Index->getType() != IndexTy) {
unsigned OldBitWidth = Index->getType()->getIntegerBitWidth();
@@ -6691,14 +6704,14 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
Value *GEPIndices[] = {ConstantInt::get(IndexTy, 0), Index};
Value *GEP =
- Builder.CreateInBoundsGEP(ArrayTy, Array, GEPIndices, "switch.gep");
+ Builder.CreateInBoundsGEP(ArrayTy, Table, GEPIndices, "switch.gep");
return Builder.CreateLoad(ArrayTy->getElementType(), GEP, "switch.load");
}
}
- llvm_unreachable("Unknown lookup table kind!");
+ llvm_unreachable("Unknown helper kind!");
}
-bool SwitchLookupTable::wouldFitInRegister(const DataLayout &DL,
+bool SwitchReplacement::wouldFitInRegister(const DataLayout &DL,
uint64_t TableSize,
Type *ElementType) {
auto *IT = dyn_cast<IntegerType>(ElementType);
@@ -6734,6 +6747,10 @@ static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI,
DL.fitsInLegalInteger(IT->getBitWidth());
}
+Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; }
+
+bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; }
+
static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) {
// 40% is the default density for building a jump table in optsize/minsize
// mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this
@@ -6760,25 +6777,23 @@ static bool isSwitchDense(ArrayRef<int64_t> Values) {
// TODO: We could support larger than legal types by limiting based on the
// number of loads required and/or table size. If the constants are small we
// could use smaller table entries and extend after the load.
-static bool
-shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
+static bool shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL,
+ const SmallVector<Type *> &ResultTypes) {
if (SI->getNumCases() > TableSize)
return false; // TableSize overflowed.
bool AllTablesFitInRegister = true;
bool HasIllegalType = false;
- for (const auto &I : ResultTypes) {
- Type *Ty = I.second;
-
+ for (const auto &Ty : ResultTypes) {
// Saturate this flag to true.
HasIllegalType = HasIllegalType || !isTypeLegalForLookupTable(Ty, TTI, DL);
// Saturate this flag to false.
AllTablesFitInRegister =
AllTablesFitInRegister &&
- SwitchLookupTable::wouldFitInRegister(DL, TableSize, Ty);
+ SwitchReplacement::wouldFitInRegister(DL, TableSize, Ty);
// If both flags saturate, we're done. NOTE: This *only* works with
// saturating flags, and all flags have to saturate first due to the
@@ -6800,7 +6815,7 @@ shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
static bool shouldUseSwitchConditionAsTableIndex(
ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal,
- bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes,
+ bool HasDefaultResults, const SmallVector<Type *> &ResultTypes,
const DataLayout &DL, const TargetTransformInfo &TTI) {
if (MinCaseVal.isNullValue())
return true;
@@ -6808,10 +6823,9 @@ static bool shouldUseSwitchConditionAsTableIndex(
MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() ||
!HasDefaultResults)
return false;
- return all_of(ResultTypes, [&](const auto &KV) {
- return SwitchLookupTable::wouldFitInRegister(
- DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */,
- KV.second /* ResultType */);
+ return all_of(ResultTypes, [&](const auto &ResultType) {
+ return SwitchReplacement::wouldFitInRegister(
+ DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, ResultType);
});
}
@@ -6900,18 +6914,13 @@ static void reuseTableCompare(
/// If the switch is only used to initialize one or more phi nodes in a common
/// successor block with different constant values, replace the switch with
/// lookup tables.
-static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
- DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
+static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
+ DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
BasicBlock *BB = SI->getParent();
Function *Fn = BB->getParent();
- // Only build lookup table when we have a target that supports it or the
- // attribute is not set.
- if (!TTI.shouldBuildLookupTables() ||
- (Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
- return false;
// FIXME: If the switch is too sparse for a lookup table, perhaps we could
// split off a dense part and build a lookup table for that.
@@ -6938,7 +6947,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
SmallDenseMap<PHINode *, ResultListTy> ResultLists;
SmallDenseMap<PHINode *, Constant *> DefaultResults;
- SmallDenseMap<PHINode *, Type *> ResultTypes;
+ SmallVector<Type *> ResultTypes;
SmallVector<PHINode *, 4> PHIs;
for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
@@ -6955,7 +6964,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
Results, DL, TTI))
return false;
- // Append the result from this case to the list for each phi.
+ // Append the result and result types from this case to the list for each
+ // phi.
for (const auto &I : Results) {
PHINode *PHI = I.first;
Constant *Value = I.second;
@@ -6963,23 +6973,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
if (Inserted)
PHIs.push_back(PHI);
It->second.push_back(std::make_pair(CaseVal, Value));
+ ResultTypes.push_back(PHI->getType());
}
}
- // Keep track of the result types.
- for (PHINode *PHI : PHIs) {
- ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
- }
-
- uint64_t NumResults = ResultLists[PHIs[0]].size();
-
// If the table has holes, we need a constant result for the default case
// or a bitmask that fits in a register.
SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
bool HasDefaultResults =
getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
DefaultResultsList, DL, TTI);
-
for (const auto &I : DefaultResultsList) {
PHINode *PHI = I.first;
Constant *Result = I.second;
@@ -6989,15 +6992,21 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
bool UseSwitchConditionAsTableIndex = shouldUseSwitchConditionAsTableIndex(
*MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI);
uint64_t TableSize;
- if (UseSwitchConditionAsTableIndex)
+ ConstantInt *TableIndexOffset;
+ if (UseSwitchConditionAsTableIndex) {
TableSize = MaxCaseVal->getLimitedValue() + 1;
- else
+ TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0);
+ } else {
TableSize =
(MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1;
+ TableIndexOffset = MinCaseVal;
+ }
+
// If the default destination is unreachable, or if the lookup table covers
// all values of the conditional variable, branch directly to the lookup table
// BB. Otherwise, check that the condition is within the case range.
+ uint64_t NumResults = ResultLists[PHIs[0]].size();
bool DefaultIsReachable = !SI->defaultDestUnreachable();
bool TableHasHoles = (NumResults < TableSize);
@@ -7025,68 +7034,100 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
if (!shouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
return false;
- std::vector<DominatorTree::UpdateType> Updates;
-
- // Compute the maximum table size representable by the integer type we are
- // switching upon.
- unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
- uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
- assert(MaxTableSize >= TableSize &&
- "It is impossible for a switch to have more entries than the max "
- "representable value of its input integer type's size.");
-
- // Create the BB that does the lookups.
- Module &Mod = *CommonDest->getParent()->getParent();
- BasicBlock *LookupBB = BasicBlock::Create(
- Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
-
// Compute the table index value.
- Builder.SetInsertPoint(SI);
Value *TableIndex;
- ConstantInt *TableIndexOffset;
if (UseSwitchConditionAsTableIndex) {
- TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0);
TableIndex = SI->getCondition();
- } else {
- TableIndexOffset = MinCaseVal;
+ if (HasDefaultResults) {
+ // Grow the table to cover all possible index values to avoid the range
+ // check. It will use the default result to fill in the table hole later,
+ // so make sure it exists.
+ ConstantRange CR =
+ computeConstantRange(TableIndex, /* ForSigned */ false);
+ // Growing the table shouldn't have any size impact; this is ensured by
+ // checking wouldFitInRegister.
+ // TODO: Consider growing the table also when it doesn't fit in a register
+ // if no optsize is specified.
+ const uint64_t UpperBound = CR.getUpper().getLimitedValue();
+ if (!CR.isUpperWrapped() &&
+ all_of(ResultTypes, [&](const auto &ResultType) {
+ return SwitchReplacement::wouldFitInRegister(DL, UpperBound,
+ ResultType);
+ })) {
+ // There may be some case index larger than the UpperBound (unreachable
+ // case), so make sure the table size does not get smaller.
+ TableSize = std::max(UpperBound, TableSize);
+ // The default branch is unreachable after we enlarge the lookup table.
+ // Adjust DefaultIsReachable to reuse code path.
+ DefaultIsReachable = false;
+ }
+ }
+ }
+
+ // Keep track of the switch replacement for each phi
+ SmallDenseMap<PHINode *, SwitchReplacement> PhiToReplacementMap;
+ for (PHINode *PHI : PHIs) {
+ const auto &ResultList = ResultLists[PHI];
+
+ Type *ResultType = ResultList.begin()->second->getType();
+ // Use any value to fill the lookup table holes.
+ Constant *DefaultVal =
+ AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
+ StringRef FuncName = Fn->getName();
+ SwitchReplacement Replacement(*Fn->getParent(), TableSize, TableIndexOffset,
+ ResultList, DefaultVal, DL, FuncName);
+ PhiToReplacementMap.insert({PHI, Replacement});
+ }
+
+ bool AnyLookupTables = any_of(
+ PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); });
+
+ // A few conditions prevent the generation of lookup tables:
+ // 1. The target does not support lookup tables.
+ // 2. The "no-jump-tables" function attribute is set.
+ // However, these objections do not apply to other switch replacements, like
+ // the bitmap, so we only stop here if any of these conditions are met and we
+ // want to create a LUT. Otherwise, continue with the switch replacement.
+ if (AnyLookupTables &&
+ (!TTI.shouldBuildLookupTables() ||
+ Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
+ return false;
+
+ Builder.SetInsertPoint(SI);
+ // TableIndex is the switch condition - TableIndexOffset if we don't
+ // use the condition directly
+ if (!UseSwitchConditionAsTableIndex) {
// If the default is unreachable, all case values are s>= MinCaseVal. Then
// we can try to attach nsw.
bool MayWrap = true;
if (!DefaultIsReachable) {
- APInt Res = MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap);
+ APInt Res =
+ MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap);
(void)Res;
}
-
TableIndex = Builder.CreateSub(SI->getCondition(), TableIndexOffset,
"switch.tableidx", /*HasNUW =*/false,
/*HasNSW =*/!MayWrap);
}
- BranchInst *RangeCheckBranch = nullptr;
+ std::vector<DominatorTree::UpdateType> Updates;
- // Grow the table to cover all possible index values to avoid the range check.
- // It will use the default result to fill in the table hole later, so make
- // sure it exist.
- if (UseSwitchConditionAsTableIndex && HasDefaultResults) {
- ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false);
- // Grow the table shouldn't have any size impact by checking
- // wouldFitInRegister.
- // TODO: Consider growing the table also when it doesn't fit in a register
- // if no optsize is specified.
- const uint64_t UpperBound = CR.getUpper().getLimitedValue();
- if (!CR.isUpperWrapped() && all_of(ResultTypes, [&](const auto &KV) {
- return SwitchLookupTable::wouldFitInRegister(
- DL, UpperBound, KV.second /* ResultType */);
- })) {
- // There may be some case index larger than the UpperBound (unreachable
- // case), so make sure the table size does not get smaller.
- TableSize = std::max(UpperBound, TableSize);
- // The default branch is unreachable after we enlarge the lookup table.
- // Adjust DefaultIsReachable to reuse code path.
- DefaultIsReachable = false;
- }
- }
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // Create the BB that does the lookups.
+ Module &Mod = *CommonDest->getParent()->getParent();
+ BasicBlock *LookupBB = BasicBlock::Create(
+ Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
+
+ BranchInst *RangeCheckBranch = nullptr;
+ Builder.SetInsertPoint(SI);
const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
Builder.CreateBr(LookupBB);
@@ -7157,25 +7198,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
for (PHINode *PHI : PHIs) {
const ResultListTy &ResultList = ResultLists[PHI];
-
- Type *ResultType = ResultList.begin()->second->getType();
-
- // Use any value to fill the lookup table holes.
- Constant *DV =
- AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
- StringRef FuncName = Fn->getName();
- SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV,
- DL, FuncName);
-
- Value *Result = Table.buildLookup(TableIndex, Builder, DL);
-
+ auto Replacement = PhiToReplacementMap.at(PHI);
+ auto *Result = Replacement.replaceSwitch(TableIndex, Builder, DL, Fn);
// Do a small peephole optimization: re-use the switch table compare if
// possible.
if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
BasicBlock *PhiBlock = PHI->getParent();
// Search for compare instructions which use the phi.
for (auto *User : PHI->users()) {
- reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+ reuseTableCompare(User, PhiBlock, RangeCheckBranch,
+ Replacement.getDefaultValue(), ResultList);
}
}
@@ -7202,7 +7234,6 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
if (DTU)
DTU->applyUpdates(Updates);
- ++NumLookupTables;
if (NeedMask)
++NumLookupTablesHoles;
return true;
@@ -7708,7 +7739,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// CVP. Therefore, only apply this transformation during late stages of the
// optimisation pipeline.
if (Options.ConvertSwitchToLookupTable &&
- switchToLookupTable(SI, Builder, DTU, DL, TTI))
+ simplifySwitchLookup(SI, Builder, DTU, DL, TTI))
return requestResimplify();
if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 2d6a748f4507..8acebbaa5458 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -97,6 +97,10 @@ static cl::opt<unsigned, false, HotColdHintParser>
static cl::opt<unsigned, false, HotColdHintParser> HotNewHintValue(
"hot-new-hint-value", cl::Hidden, cl::init(254),
cl::desc("Value to pass to hot/cold operator new for hot allocation"));
+static cl::opt<unsigned, false, HotColdHintParser> AmbiguousNewHintValue(
+ "ambiguous-new-hint-value", cl::Hidden, cl::init(222),
+ cl::desc(
+ "Value to pass to hot/cold operator new for ambiguous allocation"));
//===----------------------------------------------------------------------===//
// Helper Functions
@@ -1719,6 +1723,37 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
return nullptr;
}
+// Allow existing calls to an operator new() that takes a __hot_cold_t parameter
+// to be updated with a compiler-determined hot/cold hint value. This is used in
+// cases where the call is marked nobuiltin (because operator new was called
+// explicitly) and therefore cannot be replaced with a different callee.
+Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI,
+ IRBuilderBase &B) {
+ if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew) // both options must be enabled
+ return nullptr;
+ Function *Callee = CI->getCalledFunction();
+ if (!Callee) // getCalledFunction() gave no callee to classify
+ return nullptr;
+ LibFunc Func;
+ if (!TLI->getLibFunc(*Callee, Func)) // callee is not a recognized library function
+ return nullptr;
+ switch (Func) { // only variants that already take a hot/cold hint are handled
+ case LibFunc_Znwm12__hot_cold_t:
+ case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+ case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_Znam12__hot_cold_t:
+ case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+ case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_size_returning_new_hot_cold:
+ case LibFunc_size_returning_new_aligned_hot_cold:
+ return optimizeNew(CI, B, Func); // delegate the hint update to optimizeNew
+ default:
+ return nullptr; // any other library function is left untouched
+ }
+}
+
// When enabled, replace operator new() calls marked with a hot or cold memprof
// attribute with an operator new() call that takes a __hot_cold_t parameter.
// Currently this is supported by the open source version of tcmalloc, see:
@@ -1736,6 +1771,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
HotCold = NotColdNewHintValue;
else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "hot")
HotCold = HotNewHintValue;
+ else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() ==
+ "ambiguous")
+ HotCold = AmbiguousNewHintValue;
else
return nullptr;
@@ -1753,9 +1791,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_Znwm12__hot_cold_t, HotCold);
break;
case LibFunc_Znwm:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNew(CI->getArgOperand(0), B, TLI,
- LibFunc_Znwm12__hot_cold_t, HotCold);
+ return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+ LibFunc_Znwm12__hot_cold_t, HotCold);
break;
case LibFunc_Znam12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1763,9 +1800,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_Znam12__hot_cold_t, HotCold);
break;
case LibFunc_Znam:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNew(CI->getArgOperand(0), B, TLI,
- LibFunc_Znam12__hot_cold_t, HotCold);
+ return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+ LibFunc_Znam12__hot_cold_t, HotCold);
break;
case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1774,10 +1810,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
break;
case LibFunc_ZnwmRKSt9nothrow_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewNoThrow(
- CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
- LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
+ return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t,
+ HotCold);
break;
case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1786,10 +1821,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
break;
case LibFunc_ZnamRKSt9nothrow_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewNoThrow(
- CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
- LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
+ return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t,
+ HotCold);
break;
case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1798,10 +1832,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
break;
case LibFunc_ZnwmSt11align_val_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewAligned(
- CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
- LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
+ return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI, LibFunc_ZnwmSt11align_val_t12__hot_cold_t,
+ HotCold);
break;
case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1810,10 +1843,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
break;
case LibFunc_ZnamSt11align_val_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewAligned(
- CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
- LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
+ return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
+ TLI, LibFunc_ZnamSt11align_val_t12__hot_cold_t,
+ HotCold);
break;
case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1823,11 +1855,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
HotCold);
break;
case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewAlignedNoThrow(
- CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
- TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
- HotCold);
+ return emitHotColdNewAlignedNoThrow(
+ CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+ TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
break;
case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
if (OptimizeExistingHotColdNew)
@@ -1837,17 +1867,14 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
HotCold);
break;
case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdNewAlignedNoThrow(
- CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
- TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
- HotCold);
+ return emitHotColdNewAlignedNoThrow(
+ CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+ TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
break;
case LibFunc_size_returning_new:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI,
- LibFunc_size_returning_new_hot_cold,
- HotCold);
+ return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI,
+ LibFunc_size_returning_new_hot_cold,
+ HotCold);
break;
case LibFunc_size_returning_new_hot_cold:
if (OptimizeExistingHotColdNew)
@@ -1856,10 +1883,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
HotCold);
break;
case LibFunc_size_returning_new_aligned:
- if (HotCold != NotColdNewHintValue)
- return emitHotColdSizeReturningNewAligned(
- CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
- LibFunc_size_returning_new_aligned_hot_cold, HotCold);
+ return emitHotColdSizeReturningNewAligned(
+ CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+ LibFunc_size_returning_new_aligned_hot_cold, HotCold);
break;
case LibFunc_size_returning_new_aligned_hot_cold:
if (OptimizeExistingHotColdNew)
@@ -4094,8 +4120,11 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
// TODO: Split out the code below that operates on FP calls so that
// we can all non-FP calls with the StrictFP attribute to be
// optimized.
- if (CI->isNoBuiltin())
- return nullptr;
+ if (CI->isNoBuiltin()) {
+ // If this is an existing call to a hot cold operator new, we can update the
+ // hint parameter value, which doesn't change the callee.
+ return optimizeExistingHotColdNew(CI, Builder);
+ }
LibFunc Func;
Function *Callee = CI->getCalledFunction();
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index d52d52a9b7d3..6319fd524ff0 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -349,13 +349,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
KeyValue = Key->getValue(KeyStorage);
if (KeyValue == "source") {
- std::string Error;
-
Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
} else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue == "transform") {
@@ -379,12 +373,22 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
// TODO see if there is a more elegant solution to selecting the rewrite
// descriptor type
- if (!Target.empty())
+ if (!Target.empty()) {
DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
Source, Target, Naked));
- else
- DL->push_back(
- std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
+ return true;
+ }
+
+ {
+ std::string Error;
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Descriptor, "invalid Source regex: " + Error);
+ return false;
+ }
+ }
+
+ DL->push_back(
+ std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
return true;
}
@@ -418,13 +422,7 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
KeyValue = Key->getValue(KeyStorage);
if (KeyValue == "source") {
- std::string Error;
-
Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
} else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue == "transform") {
@@ -441,13 +439,23 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
return false;
}
- if (!Target.empty())
+ if (!Target.empty()) {
DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
Source, Target,
/*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
- Source, Transform));
+ return true;
+ }
+
+ {
+ std::string Error;
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Descriptor, "invalid Source regex: " + Error);
+ return false;
+ }
+ }
+
+ DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
+ Source, Transform));
return true;
}
@@ -481,13 +489,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
KeyValue = Key->getValue(KeyStorage);
if (KeyValue == "source") {
- std::string Error;
-
Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
} else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue == "transform") {
@@ -504,13 +506,23 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
return false;
}
- if (!Target.empty())
+ if (!Target.empty()) {
DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
Source, Target,
/*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
- Source, Transform));
+ return true;
+ }
+
+ {
+ std::string Error;
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Descriptor, "invalid Source regex: " + Error);
+ return false;
+ }
+ }
+
+ DL->push_back(
+ std::make_unique<PatternRewriteNamedAliasDescriptor>(Source, Transform));
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae..53129e2e5fbb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -170,10 +170,10 @@ private:
bool recognizeFindFirstByte();
Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
- unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
- BasicBlock *ExitFail, Value *SearchStart,
- Value *SearchEnd, Value *NeedleStart,
- Value *NeedleEnd);
+ unsigned VF, Type *CharTy, Value *IndPhi,
+ BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *SearchStart, Value *SearchEnd,
+ Value *NeedleStart, Value *NeedleEnd);
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
BasicBlock *ExitSucc, BasicBlock *ExitFail,
@@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) {
return false;
}
+static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes,
+ BasicBlock *SuccBB, BasicBlock *IncBB) {
+ for (PHINode &PN : SuccBB->phis()) {
+ // Every PHI in SuccBB gets at most one new incoming entry for IncBB. Look
+ // through the incoming values to find ScalarRes, meaning this is a
+ // PHI collecting the results of the transformation.
+ bool ResPhi = false;
+ for (Value *Op : PN.incoming_values())
+ if (Op == ScalarRes) {
+ ResPhi = true;
+ break;
+ }
+
+ // Any PHI that depended upon the result of the transformation needs a new
+ // incoming value (VectorRes) from IncBB.
+ if (ResPhi)
+ PN.addIncoming(VectorRes, IncBB);
+ else {
+ // There should be no other outside uses of other values in the
+ // original loop. Any incoming values should either:
+ // 1. Be for blocks outside the loop, which aren't interesting. Or ..
+ // 2. Be from blocks in the loop with values defined outside
+ // the loop. We should add a similar incoming value from IncBB.
+ for (BasicBlock *BB : PN.blocks())
+ if (L->contains(BB)) {
+ PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB);
+ break; // only the first in-loop incoming block is mirrored
+ }
+ }
+ }
+}
+
bool LoopIdiomVectorize::recognizeByteCompare() {
// Currently the transformation only works on scalable vector types, although
// there is no fundamental reason why it cannot be made to work for fixed
@@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
{VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
- StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
- auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
- Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
- Value *VectorMatchCmp = Builder.CreateIntrinsic(
- Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
- {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
- "mismatch.cmp");
+ Value *VectorMatchCmp =
+ Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp");
Value *CTZ = Builder.CreateIntrinsic(
Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
{VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
@@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
}
- auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
- for (PHINode &PN : SuccBB->phis()) {
- // At this point we've already replaced all uses of the result from the
- // loop with ByteCmp. Look through the incoming values to find ByteCmp,
- // meaning this is a Phi collecting the results of the byte compare.
- bool ResPhi = false;
- for (Value *Op : PN.incoming_values())
- if (Op == ByteCmpRes) {
- ResPhi = true;
- break;
- }
-
- // Any PHI that depended upon the result of the byte compare needs a new
- // incoming value from CmpBB. This is because the original loop will get
- // deleted.
- if (ResPhi)
- PN.addIncoming(ByteCmpRes, CmpBB);
- else {
- // There should be no other outside uses of other values in the
- // original loop. Any incoming values should either:
- // 1. Be for blocks outside the loop, which aren't interesting. Or ..
- // 2. These are from blocks in the loop with values defined outside
- // the loop. We should a similar incoming value from CmpBB.
- for (BasicBlock *BB : PN.blocks())
- if (CurLoop->contains(BB)) {
- PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
- break;
- }
- }
- }
- };
-
// Ensure all Phis in the successors of CmpBB have an incoming value from it.
- fixSuccessorPhis(EndBB);
+ fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB);
if (EndBB != FoundBB)
- fixSuccessorPhis(FoundBB);
+ fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB);
// The new CmpBB block isn't part of the loop, but will need to be added to
// the outer loop if there is one.
@@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
Value *LoopIdiomVectorize::expandFindFirstByte(
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
- BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
- Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
+ Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail,
+ Value *SearchStart, Value *SearchEnd, Value *NeedleStart,
+ Value *NeedleEnd) {
// Set up some types and constants that we intend to reuse.
auto *PtrTy = Builder.getPtrTy();
auto *I64Ty = Builder.getInt64Ty();
@@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
MatchLCSSA->addIncoming(Search, BB2);
MatchPredLCSSA->addIncoming(MatchPred, BB2);
+ // Ensure all Phis in the successors of BB3/BB5 have an incoming value from
+ // them.
+ fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3);
+ if (ExitSucc != ExitFail)
+ fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5);
+
if (VerifyLoops) {
OuterLoop->verifyLoop();
InnerLoop->verifyLoop();
@@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte(
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
- Value *MatchVal =
- expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
- SearchStart, SearchEnd, NeedleStart, NeedleEnd);
+ expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail,
+ SearchStart, SearchEnd, NeedleStart, NeedleEnd);
assert(PHBranch->isUnconditional() &&
"Expected preheader to terminate with an unconditional branch.");
- // Add new incoming values with the result of the transformation to PHINodes
- // of ExitSucc that use IndPhi.
- for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
- auto *PN = dyn_cast<PHINode>(U);
- if (PN && PN->getParent() == ExitSucc)
- PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
- }
-
if (VerifyLoops && CurLoop->getParentLoop()) {
CurLoop->getParentLoop()->verifyLoop();
if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 789047a2a28e..2704e66f3a70 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,8 +15,10 @@
//
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
});
}
- if (!LAI->canVectorizeMemory())
+ if (!LAI->canVectorizeMemory()) {
+ if (hasUncountableExitWithSideEffects()) {
+ reportVectorizationFailure(
+ "Cannot vectorize unsafe dependencies in uncountable exit loop with "
+ "side effects",
+ "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
+ TheLoop);
+ return false;
+ }
+
return canVectorizeIndirectUnsafeDependences();
+ }
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!isGuaranteedNotToBePoison(CurrV, AC,
TheLoop->getLoopPredecessor()
->getTerminator()
- ->getIterator()))
+ ->getIterator(),
+ DT))
return false;
continue;
}
@@ -1754,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
};
+ bool HasSideEffects = false;
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
if (I.mayWriteToMemory()) {
- // We don't support writes to memory.
+ if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
+ HasSideEffects = true;
+ continue;
+ }
+
+ // We don't support complex writes to memory.
reportVectorizationFailure(
- "Writes to memory unsupported in early exit loops",
- "Cannot vectorize early exit loop with writes to memory",
+ "Complex writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with complex writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
- } else if (!IsSafeOperation(&I)) {
+ }
+
+ if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
@@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
"Expected latch predecessor to be the early exiting block");
+ SmallVector<LoadInst *, 4> NonDerefLoads;
// TODO: Handle loops that may fault.
- Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ if (!HasSideEffects) {
+ // Read-only loop.
+ Predicates.clear();
+ if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
+ &Predicates)) {
+ reportVectorizationFailure(
+ "Loop may fault", "Cannot vectorize non-read-only early exit loop",
+ "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ } else if (!canUncountableExitConditionLoadBeMoved(
+ SingleUncountableExitingBlock))
return false;
+
+ // Check non-dereferenceable loads if any.
+ for (LoadInst *LI : NonDerefLoads) {
+ // Only support unit-stride access for now.
+ int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
+ if (Stride != 1) {
+ reportVectorizationFailure(
+ "Loop contains potentially faulting strided load",
+ "Cannot vectorize early exit loop with "
+ "strided fault-only-first load",
+ "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
+ return false;
+ }
+ PotentiallyFaultingLoads.insert(LI);
+ LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
+ << "\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
"backedge taken count: "
<< *SymbolicMaxBTC << '\n');
UncountableExitingBB = SingleUncountableExitingBlock;
+ UncountableExitWithSideEffects = HasSideEffects;
+ return true;
+}
+
+bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
+ BasicBlock *ExitingBlock) {
+ // Try to find a load in the critical path for the uncountable exit condition.
+ // This is currently matching about the simplest form we can, expecting
+ // only one in-loop load, the result of which is directly compared against
+ // a loop-invariant value.
+ // FIXME: We're insisting on a single use for now, because otherwise we will
+ // need to make PHI nodes for other users. That can be done once the initial
+ // transform code lands.
+ auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
+
+ using namespace llvm::PatternMatch;
+ Instruction *L = nullptr;
+ Value *Ptr = nullptr;
+ Value *R = nullptr;
+ if (!match(Br->getCondition(),
+ m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
+ m_Value(R))))) {
+ reportVectorizationFailure(
+ "Early exit loop with store but no supported condition load",
+ "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ // FIXME: Don't rely on operand ordering for the comparison.
+ if (!TheLoop->isLoopInvariant(R)) {
+ reportVectorizationFailure(
+ "Early exit loop with store but no supported condition load",
+ "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ // Make sure that the load address is not loop invariant; we want an
+ // address calculation that we can rotate to the next vector iteration.
+ const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
+ if (!isa<SCEVAddRecExpr>(PtrScev)) {
+ reportVectorizationFailure(
+ "Uncountable exit condition depends on load with an address that is "
+ "not an add recurrence",
+ "EarlyExitLoadInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+
+ // FIXME: Support gathers after first-faulting load support lands.
+ SmallVector<const SCEVPredicate *, 4> Predicates;
+ LoadInst *Load = cast<LoadInst>(L);
+ if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
+ &Predicates)) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(TheLoop);
+ // We need to know that load will be executed before we can hoist a
+ // copy out to run just before the first iteration.
+ // FIXME: Currently, other restrictions prevent us from reaching this point
+ // with a loop where the uncountable exit condition is determined
+ // by a conditional load.
+ assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
+ "Unhandled control flow in uncountable exit loop with side effects");
+
+ // Prohibit any potential aliasing with any instruction in the loop which
+ // might store to memory.
+ // FIXME: Relax this constraint where possible.
+ for (auto *BB : TheLoop->blocks()) {
+ for (auto &I : *BB) {
+ if (&I == Load)
+ continue;
+
+ if (I.mayWriteToMemory()) {
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
+ if (AR == AliasResult::NoAlias)
+ continue;
+ }
+
+ reportVectorizationFailure(
+ "Cannot determine whether critical uncountable exit load address "
+ "does not alias with a memory write",
+ "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
} else {
if (!isVectorizableEarlyExitLoop()) {
assert(!hasUncountableEarlyExit() &&
+ !hasUncountableExitWithSideEffects() &&
"Must be false without vectorizable early-exit loop");
if (DoExtraAnalysis)
Result = false;
@@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
}
+ // Bail out for state-changing loops with uncountable exits for now.
+ if (UncountableExitWithSideEffects) {
+ reportVectorizationFailure(
+ "Writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with writes to memory",
+ "WritesInEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
if (Result) {
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 838476dcae66..d34d2ae7a0b3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -334,6 +334,10 @@ public:
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
}
+ VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) {
+ return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -559,6 +563,20 @@ public:
/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+ /// Create a check to \p Plan to see if the vector loop should be executed
+ /// based on its trip count.
+ void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const;
+
+ /// Update loop metadata and profile info for both the scalar remainder loop
+ /// and \p VectorLoop, if it exists. Keeps all loop hints from the original
+ /// loop on the vector loop and replaces vectorizer-specific metadata.
+ void updateLoopMetadataAndProfileInfo(Loop *VectorLoop,
+ VPBasicBlock *HeaderVPBB,
+ bool VectorizingEpilogue,
+ unsigned EstimatedVFxUF,
+ bool DisableRuntimeUnroll);
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
@@ -613,13 +631,15 @@ private:
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B, bool HasTail) const;
+ const VectorizationFactor &B, bool HasTail,
+ bool IsEpilogue = false) const;
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- const unsigned MaxTripCount, bool HasTail) const;
+ const unsigned MaxTripCount, bool HasTail,
+ bool IsEpilogue = false) const;
/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754..3cff43a51029 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -165,15 +165,6 @@ using namespace SCEVPatternMatch;
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif
-/// @{
-/// Metadata attribute names
-const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
-const char LLVMLoopVectorizeFollowupVectorized[] =
- "llvm.loop.vectorize.followup_vectorized";
-const char LLVMLoopVectorizeFollowupEpilogue[] =
- "llvm.loop.vectorize.followup_epilogue";
-/// @}
-
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
@@ -500,26 +491,22 @@ public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- ElementCount VecWidth,
- ElementCount MinProfitableTripCount,
- unsigned UnrollFactor, LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &RTChecks, VPlan &Plan)
+ ElementCount VecWidth, unsigned UnrollFactor,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
- VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
- UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
- BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
VectorPHVPBB(cast<VPBasicBlock>(
Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop that will contain vectorized instructions later
- /// on, while the old loop will be used as the scalar remainder. Control flow
- /// is generated around the vectorized (and scalar epilogue) loops consisting
- /// of various checks and bypasses. Return the pre-header block of the new
- /// loop. In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
+ /// Creates a basic block for the scalar preheader. Both
+ /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
+ /// the method to create additional blocks and checks needed for epilogue
+ /// vectorization.
virtual BasicBlock *createVectorizedLoopSkeleton();
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -536,38 +523,18 @@ public:
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
- /// Return the additional bypass block which targets the scalar loop by
- /// skipping the epilogue loop after completing the main loop.
- BasicBlock *getAdditionalBypassBlock() const {
- assert(AdditionalBypassBlock &&
- "Trying to access AdditionalBypassBlock but it has not been set");
- return AdditionalBypassBlock;
- }
-
protected:
friend class LoopVectorizationPlanner;
- // Create a check to see if the vector loop should be executed
- Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitIterationCountCheck(BasicBlock *Bypass);
-
- /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader.
- void createVectorLoopSkeleton(StringRef Prefix);
+ /// Create and return a new IR basic block for the scalar preheader whose name
+ /// is prefixed with \p Prefix.
+ BasicBlock *createScalarPreheader(StringRef Prefix);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart() {}
virtual void printDebugTracesAtEnd() {}
- /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
- /// vector preheader and its predecessor, also connecting the new block to the
- /// scalar preheader.
- void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
-
/// The original loop.
Loop *OrigLoop;
@@ -592,8 +559,6 @@ protected:
/// vector elements.
ElementCount VF;
- ElementCount MinProfitableTripCount;
-
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -603,18 +568,9 @@ protected:
// --- Vectorization state ---
- /// The vector-loop preheader.
- BasicBlock *LoopVectorPreHeader = nullptr;
-
- /// The scalar-loop preheader.
- BasicBlock *LoopScalarPreHeader = nullptr;
-
/// Trip count of the original loop.
Value *TripCount = nullptr;
- /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount = nullptr;
-
/// The profitablity analysis.
LoopVectorizationCostModel *Cost;
@@ -626,11 +582,6 @@ protected:
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
- /// The additional bypass block which conditionally skips over the epilogue
- /// loop after executing the main loop. Needed to resume inductions and
- /// reductions during epilogue vectorization.
- BasicBlock *AdditionalBypassBlock = nullptr;
-
VPlan &Plan;
/// The vector preheader block of \p Plan, used as target for check blocks
@@ -679,20 +630,8 @@ public:
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
ElementCount MinProfitableTripCount, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
- MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
- Checks, Plan),
- EPI(EPI) {}
-
- // Override this function to handle the more complex control flow around the
- // three loops.
- BasicBlock *createVectorizedLoopSkeleton() final {
- return createEpilogueVectorizedLoopSkeleton();
- }
-
- /// The interface for creating a vectorized skeleton using one of two
- /// different strategies, each corresponding to one execution of the vplan
- /// as described above.
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+ UnrollFactor, CM, BFI, PSI, Checks, Plan),
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -701,6 +640,9 @@ public:
/// iteration count of the loop is so small that the main vector loop is
/// completely skipped.
EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+ ElementCount MinProfitableTripCount;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -720,14 +662,24 @@ public:
BFI, PSI, Check, Plan, EPI.MainLoopVF,
EPI.MainLoopVF, EPI.MainLoopUF) {}
/// Implements the interface for creating a vectorized skeleton using the
- /// *main loop* strategy (ie the first pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+ /// *main loop* strategy (i.e., the first pass of VPlan execution).
+ BasicBlock *createVectorizedLoopSkeleton() final;
protected:
+ /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+ /// vector preheader and its predecessor, also connecting the new block to the
+ /// scalar preheader.
+ void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
+ // Create a check to see if the main vector loop should be executed
+ Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
+ unsigned UF) const;
+
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
- BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
+ BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
+ bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
@@ -736,6 +688,11 @@ protected:
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+ /// The additional bypass block which conditionally skips over the epilogue
+ /// loop after executing the main loop. Needed to resume inductions and
+ /// reductions during epilogue vectorization.
+ BasicBlock *AdditionalBypassBlock = nullptr;
+
public:
EpilogueVectorizerEpilogueLoop(
Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -749,14 +706,22 @@ public:
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
- /// *epilogue loop* strategy (ie the second pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+ /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
+ BasicBlock *createVectorizedLoopSkeleton() final;
+
+ /// Return the additional bypass block which targets the scalar loop by
+ /// skipping the epilogue loop after completing the main loop.
+ BasicBlock *getAdditionalBypassBlock() const {
+ assert(AdditionalBypassBlock &&
+ "Trying to access AdditionalBypassBlock but it has not been set");
+ return AdditionalBypassBlock;
+ }
protected:
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
/// the vector epilogue or the scalar epilogue.
- BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH,
BasicBlock *Bypass,
BasicBlock *Insert);
void printDebugTracesAtStart() override;
@@ -962,8 +927,8 @@ public:
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
- /// \return True if register pressure should be calculated for the given VF.
- bool shouldCalculateRegPressureForVF(ElementCount VF);
+ /// \return True if register pressure should be considered for the given VF.
+ bool shouldConsiderRegPressureForVF(ElementCount VF);
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1159,7 +1124,10 @@ public:
CallWideningDecision getCallWideningDecision(CallInst *CI,
ElementCount VF) const {
assert(!VF.isScalar() && "Expected vector VF");
- return CallWideningDecisions.at({CI, VF});
+ auto I = CallWideningDecisions.find({CI, VF});
+ if (I == CallWideningDecisions.end())
+ return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+ return I->second;
}
/// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1682,7 +1650,9 @@ private:
Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
TheLoop->isLoopInvariant(I) ||
- getWideningDecision(I, VF) == CM_Scalarize)
+ getWideningDecision(I, VF) == CM_Scalarize ||
+ (isa<CallInst>(I) &&
+ getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
return false;
// Assume we can vectorize V (and hence we need extraction) if the
@@ -1878,6 +1848,8 @@ public:
"claimed checks are required");
}
+ SCEVExp.eraseDeadInstructions(SCEVCheckCond);
+
if (!MemCheckBlock && !SCEVCheckBlock)
return;
@@ -2030,7 +2002,7 @@ public:
/// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
/// outside VPlan.
- std::pair<Value *, BasicBlock *> getSCEVChecks() {
+ std::pair<Value *, BasicBlock *> getSCEVChecks() const {
using namespace llvm::PatternMatch;
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
return {nullptr, nullptr};
@@ -2040,7 +2012,7 @@ public:
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
/// outside VPlan.
- std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
using namespace llvm::PatternMatch;
if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
return {nullptr, nullptr};
@@ -2049,9 +2021,7 @@ public:
/// Return true if any runtime checks have been added
bool hasChecks() const {
- using namespace llvm::PatternMatch;
- return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
- MemRuntimeCheckCond;
+ return getSCEVChecks().first || getMemRuntimeChecks().first;
}
};
} // namespace
@@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
+ BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
}
}
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
- unsigned UF) const {
+Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
+ BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
// vector trip count is zero. This check also covers the case where adding one
@@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ BasicBlock *const TCCheckBlock = VectorPH;
IRBuilder<InstSimplifyFolder> Builder(
TCCheckBlock->getContext(),
InstSimplifyFolder(TCCheckBlock->getDataLayout()));
@@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return CheckMinIters;
}
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- Value *CheckMinIters = createIterationCountCheck(VF, UF);
- // Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
- assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
- TCCheckBlock &&
- "Plan's entry must be TCCCheckBlock");
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
return IRVPBB;
}
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
- LoopVectorPreHeader = OrigLoop->getLoopPreheader();
- assert(LoopVectorPreHeader && "Invalid loop structure");
+BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+ assert(VectorPH && "Invalid loop structure");
assert((OrigLoop->getUniqueLatchExitBlock() ||
Cost->requiresScalarEpilogue(VF.isVector())) &&
"loops not exiting via the latch without required epilogue?");
- LoopScalarPreHeader =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- LI, nullptr, Twine(Prefix) + "scalar.ph");
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
- // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
- // preheader may be unreachable at this point. Instead it is replaced in
- // createVectorizedLoopSkeleton.
+ // wrapping the newly created scalar preheader here at the moment, because the
+ // Plan's scalar preheader may be unreachable at this point. Instead it is
+ // replaced in executePlan.
+ return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
+ Twine(Prefix) + "scalar.ph");
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore(
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
- / | preheader are expanded here. Eventually all required SCEV
- / | expansion should happen here.
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop (created during VPlan execution).
- | |
- | v
- \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
- | | successors created during VPlan execution)
- \/ |
- /\ v
- | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
- | |
- (opt) v <-- edge from middle to exit iff epilogue is not required.
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
- | | wrapped in VPIRBasicBlock).
- \ |
- \ v
- >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
- ...
- */
-
- // Create an empty vector loop, and prepare basic blocks for the runtime
- // checks.
- createVectorLoopSkeleton("");
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitIterationCountCheck(LoopScalarPreHeader);
-
- replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
- return LoopVectorPreHeader;
+ // Create a new IR basic block for the scalar preheader.
+ BasicBlock *ScalarPH = createScalarPreheader("");
+ return ScalarPH->getSinglePredecessor();
}
namespace {
@@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Remove redundant induction instructions.
cse(HeaderBB);
-
- // Set/update profile weights for the vector and remainder loops as original
- // loop iterations are now distributed among them. Note that original loop
- // becomes the scalar remainder loop after vectorization.
- //
- // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
- // end up getting slightly roughened result but that should be OK since
- // profile is not inherently precise anyway. Note also possible bypass of
- // vector code caused by legality checks is ignored, assigning all the weight
- // to the vector loop, optimistically.
- //
- // For scalable vectorization we can't know at compile time how many
- // iterations of the loop are handled in one vector iteration, so instead
- // use the value of vscale used for tuning.
- Loop *VectorLoop = LI->getLoopFor(HeaderBB);
- unsigned EstimatedVFxUF =
- estimateElementCount(VF * UF, Cost->getVScaleForTuning());
- setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
toVectorTy(Type::getInt1Ty(I->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- // Certain instructions can be cheaper to vectorize if they have a constant
- // second vector operand. One example of this are shifts on x86.
- Value *Op2 = I->getOperand(1);
- auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
- Legal->isInvariant(Op2))
- Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-
SmallVector<const Value *, 4> Operands(I->operand_values());
SafeDivisorCost += TTI.getArithmeticInstrCost(
- I->getOpcode(), VecTy, CostKind,
- {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
- Op2Info, Operands, I);
+ I->getOpcode(), VecTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Operands, I);
return {ScalarizationCost, SafeDivisorCost};
}
@@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
-bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
+bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
@@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
const unsigned MaxTripCount,
- bool HasTail) const {
+ bool HasTail,
+ bool IsEpilogue) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
@@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+ bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
A.Width.isScalable() && !B.Width.isScalable();
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
@@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- bool HasTail) const {
+ bool HasTail,
+ bool IsEpilogue) const {
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
- return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
- HasTail);
+ return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
+ IsEpilogue);
}
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
@@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
// If no def nor is a store, e.g., branches, continue - no value to check.
if (R.getNumDefinedValues() == 0 &&
- !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
- &R))
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
continue;
// For multi-def recipes, currently only interleaved loads, suffice to
// check first def only.
@@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
P->vectorFactors().end());
SmallVector<VPRegisterUsage, 8> RUs;
- if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
- CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+ if (any_of(VFs, [this](ElementCount VF) {
+ return CM.shouldConsiderRegPressureForVF(VF);
+ }))
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
for (unsigned I = 0; I < VFs.size(); I++) {
@@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
/// If the register pressure needs to be considered for VF,
/// don't consider the VF as valid if it exceeds the number
/// of registers for the target.
- if (CM.shouldCalculateRegPressureForVF(VF) &&
+ if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
continue;
@@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (!VPI)
continue;
switch (VPI->getOpcode()) {
- case VPInstruction::ActiveLaneMask:
+ // Selects are only modelled in the legacy cost model for safe
+ // divisors.
+ case Instruction::Select: {
+ VPValue *VPV = VPI->getVPSingleValue();
+ if (VPV->getNumUsers() == 1) {
+ if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
+ switch (WR->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ continue;
+ default:
+ break;
+ }
+ }
+ }
+ C += VPI->cost(VF, CostCtx);
+ break;
+ }
+ case VPInstruction::ActiveLaneMask: {
+ unsigned Multiplier =
+ cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+ ->getZExtValue();
+ C += VPI->cost(VF * Multiplier, CostCtx);
+ break;
+ }
case VPInstruction::ExplicitVectorLength:
C += VPI->cost(VF, CostCtx);
break;
@@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
}
if (Result.Width.isScalar() ||
- isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
+ isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
+ /*IsEpilogue*/ true))
Result = NextVF;
}
@@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = toVectorTy(Ptr->getType(), VF);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = Ptr->getType();
+
+ if (!Legal->isUniform(Ptr, VF))
+ PtrTy = toVectorTy(PtrTy, VF);
return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI::CastContextHint::None, CostKind, RedOp);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+ IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
if (RedCost.isValid() &&
RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+ IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
InstructionCost ExtraExtCost = 0;
if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getMulAccReductionCost(
- true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+ true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
+ CostKind);
if (RedCost.isValid() && RedCost < MulCost + BaseCost)
return I == RetI ? RedCost : 0;
@@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
assert(Op0->getType()->getScalarSizeInBits() == 1 &&
Op1->getType()->getScalarSizeInBits() == 1);
- SmallVector<const Value *, 2> Operands{Op0, Op1};
return TTI.getArithmeticInstrCost(
- match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
- CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
+ match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
+ VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
}
Type *CondTy = SI->getCondition()->getType();
@@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}))
continue;
VecValuesToIgnore.insert(Op);
- DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
+ append_range(DeadInterleavePointerOps, Op->operands());
}
for (const auto &[_, Ops] : DeadInvariantStoreOps)
@@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
ValuesToIgnore.insert(Op);
VecValuesToIgnore.insert(Op);
- DeadOps.append(Op->op_begin(), Op->op_end());
+ append_range(DeadOps, Op->operands());
}
// Ignore type-promoting instructions we identified during reduction
@@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
ElementCount VF) const {
- if (ForceTargetInstructionCost.getNumOccurrences())
- return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
- return CM.getInstructionCost(UI, VF);
+ InstructionCost Cost = CM.getInstructionCost(UI, VF);
+ if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
+ return InstructionCost(ForceTargetInstructionCost);
+ return Cost;
}
bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
@@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
P->vectorFactors().end());
SmallVector<VPRegisterUsage, 8> RUs;
- if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
- CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+ if (any_of(VFs, [this](ElementCount VF) {
+ return CM.shouldConsiderRegPressureForVF(VF);
+ }))
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
for (unsigned I = 0; I < VFs.size(); I++) {
@@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (CM.shouldCalculateRegPressureForVF(VF) &&
+ if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
@@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
return BestFactor;
}
-static void addRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().starts_with("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(I));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
using namespace VPlanPatternMatch;
assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
@@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
// from the main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
+ VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
// Get the VPInstruction computing the reduction result in the middle block.
// The first operand may not be from the middle block if it is not connected
// to the scalar preheader. In that case, there's nothing to fix.
@@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
// When fixing reductions in the epilogue loop we should already have
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
- auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
- EpiResumePhi->setIncomingValueForBlock(
+ EpiResumePhi.setIncomingValueForBlock(
BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
@@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan, BestVF, VScale);
}
- if (!VectorizingEpilogue) {
- // Checks are the same for all VPlans, added to BestVPlan only for
- // compactness.
- attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
- }
+ // Checks are the same for all VPlans, added to BestVPlan only for
+ // compactness.
+ attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+ VPlanTransforms::cse(BestVPlan);
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- BasicBlock *EntryBB =
- cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
State.CFG.PrevBB->getSingleSuccessor());
@@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// looked through single-entry phis.
ScalarEvolution &SE = *PSE.getSE();
for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
- if (Exit->getNumPredecessors() == 0)
+ if (!Exit->hasPredecessors())
continue;
for (VPRecipeBase &PhiR : Exit->phis())
SE.forgetLcssaPhiWithNewPredecessor(
@@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//
//===------------------------------------------------===//
- // Move check blocks to their final position.
- // TODO: Move as part of VPIRBB execute and update impacted tests.
- if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
- MemCheckBlock->moveAfter(EntryBB);
- if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
- SCEVCheckBlock->moveAfter(EntryBB);
-
BestVPlan.execute(&State);
- // 2.5 When vectorizing the epilogue, fix reduction resume values from the
- // additional bypass block.
- if (VectorizingEpilogue) {
- assert(!BestVPlan.hasEarlyExit() &&
- "Epilogue vectorisation not yet supported with early exits");
- BasicBlock *PH = OrigLoop->getLoopPreheader();
- BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
- for (auto *Pred : predecessors(PH)) {
- for (PHINode &Phi : PH->phis()) {
- if (Phi.getBasicBlockIndex(Pred) != -1)
- continue;
- Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
- }
- }
- VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
- if (ScalarPH->getNumPredecessors() > 0) {
- // If ScalarPH has predecessors, we may need to update its reduction
- // resume values.
- for (VPRecipeBase &R : ScalarPH->phis()) {
- fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
- BypassBlock);
- }
- }
- }
-
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
- if (HeaderVPBB) {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- std::optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
-
- Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
- if (VectorizedLoopID) {
- L->setLoopID(*VectorizedLoopID);
- } else {
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
-
- // Check if it's EVL-vectorized and mark the corresponding metadata.
- bool IsEVLVectorized =
- llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
- // Looking for the ExplictVectorLength VPInstruction.
- if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
- return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
- return false;
- });
- if (IsEVLVectorized) {
- LLVMContext &Context = L->getHeader()->getContext();
- MDNode *LoopID = L->getLoopID();
- auto *IsEVLVectorizedMD = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
- MDString::get(Context, "evl")});
- MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
- {IsEVLVectorizedMD});
- L->setLoopID(NewLoopID);
- }
- }
- TargetTransformInfo::UnrollingPreferences UP;
- TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
- addRuntimeUnrollDisableMetaData(L);
- }
+ // Add metadata to disable runtime unrolling a scalar loop when there
+ // are no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
+ updateLoopMetadataAndProfileInfo(
+ HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
+ : nullptr,
+ HeaderVPBB, VectorizingEpilogue,
+ estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
+ DisableRuntimeUnroll);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("");
+BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
+ BasicBlock *ScalarPH = createScalarPreheader("");
+ BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
EPI.EpilogueIterationCountCheck =
- emitIterationCountCheck(LoopScalarPreHeader, true);
+ emitIterationCountCheck(VectorPH, ScalarPH, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
+ VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
+ ->getSuccessor(1);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
// that goes directly through the vector epilogue. The longer-path length for
@@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
// trip count. Note: the branch will get updated later on when we vectorize
// the epilogue.
EPI.MainLoopIterationCountCheck =
- emitIterationCountCheck(LoopScalarPreHeader, false);
+ emitIterationCountCheck(VectorPH, ScalarPH, false);
- return LoopVectorPreHeader;
+ return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
+ ->getSuccessor(1);
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
});
}
-BasicBlock *
-EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
- bool ForEpilogue) {
+BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
+ BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
assert(Bypass && "Expected valid bypass basic block.");
Value *Count = getTripCount();
MinProfitableTripCount = ElementCount::getFixed(0);
- Value *CheckMinIters =
- createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
- ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
+ Value *CheckMinIters = createIterationCountCheck(
+ VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
+ ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ BasicBlock *const TCCheckBlock = VectorPH;
if (!ForEpilogue)
TCCheckBlock->setName("vector.main.loop.iter.check");
// Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
+ VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+ static_cast<DominatorTree *>(nullptr), LI, nullptr,
+ "vector.ph");
if (ForEpilogue) {
// Save the trip count so we don't have to regenerate it in the
// vec.epilog.iter.check. This is safe to do because the trip count
// generated here dominates the vector epilog iter check.
EPI.TripCount = Count;
} else {
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
}
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
@@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("vec.epilog.");
-
+BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
+ BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
+ BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
- LoopVectorPreHeader->setName("vec.epilog.ph");
+ VectorPH->setName("vec.epilog.ph");
BasicBlock *VecEpilogueIterationCountCheck =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
- nullptr, "vec.epilog.iter.check", true);
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr,
+ "vec.epilog.iter.check", true);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
- emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
+ emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH,
VecEpilogueIterationCountCheck);
AdditionalBypassBlock = VecEpilogueIterationCountCheck;
@@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
"expected this to be saved from the previous pass.");
EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+ VecEpilogueIterationCountCheck, VectorPH);
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
// Adjust the terminators of runtime check blocks and phis using them.
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
if (SCEVCheckBlock)
SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
if (MemCheckBlock)
MemCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+ VecEpilogueIterationCountCheck, ScalarPH);
- DT->changeImmediateDominator(LoopScalarPreHeader,
- EPI.EpilogueIterationCountCheck);
+ DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck);
// The vec.epilog.iter.check block may contain Phi nodes from inductions or
// reductions which merge control-flow from the latch block and the middle
@@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
for (PHINode *Phi : PhisInBlock) {
- Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
+ Phi->moveBefore(VectorPH->getFirstNonPHIIt());
Phi->replaceIncomingBlockWith(
VecEpilogueIterationCountCheck->getSinglePredecessor(),
VecEpilogueIterationCountCheck);
@@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
Phi->removeIncomingValue(MemCheckBlock);
}
- return LoopVectorPreHeader;
+ return VectorPH;
}
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
- BasicBlock *Bypass, BasicBlock *Insert) {
+ BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) {
assert(EPI.TripCount &&
"Expected trip count to have been saved in the first pass.");
@@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
EPI.EpilogueVF, EPI.EpilogueUF),
"min.epilog.iters.check");
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- auto VScale = Cost->getVScaleForTuning();
- unsigned MainLoopStep =
- estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
- unsigned EpilogueLoopStep =
- estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
- // We assume the remaining `Count` is equally distributed in
- // [0, MainLoopStep)
- // So the probability for `Count < EpilogueLoopStep` should be
- // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
- unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
- const uint32_t Weights[] = {EstimatedSkipCount,
- MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights, /*IsExpected=*/false);
- }
+ BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+ auto VScale = Cost->getVScaleForTuning();
+ unsigned MainLoopStep =
+ estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
+ unsigned EpilogueLoopStep =
+ estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
+ // We assume the remaining `Count` is equally distributed in
+ // [0, MainLoopStep)
+ // So the probability for `Count < EpilogueLoopStep` should be
+ // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
+ // TODO: Improve the estimate by taking the estimated trip count into
+ // consideration.
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
ReplaceInstWithInst(Insert->getTerminator(), &BI);
// A new entry block has been created for the epilogue VPlan. Hook it in, as
@@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
- Range);
+ VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
CM.foldTailByMasking());
@@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
- if (!Recipe) {
- SmallVector<VPValue *, 4> Operands(R.operands());
- Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
- }
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
@@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// to remove the need to keep a map of masks beyond the predication
// transform.
RecipeBuilder.updateBlockMaskCache(Old2New);
- for (const auto &[Old, _] : Old2New)
+ for (VPValue *Old : Old2New.keys())
Old->getDefiningRecipe()->eraseFromParent();
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
@@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
InterleaveGroups, RecipeBuilder,
CM.isScalarEpilogueAllowed());
- // Replace VPValues for known constant strides guaranteed by predicate scalar
- // evolution.
- auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
- auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
- R->getParent() ==
- Plan->getVectorLoopRegion()->getSinglePredecessor();
- };
- for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
- auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
- auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
- // Only handle constant strides for now.
- if (!ScevStride)
- continue;
-
- auto *CI = Plan->getOrAddLiveIn(
- ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
- if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
- StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
-
- // The versioned value may not be used in the loop directly but through a
- // sext/zext. Add new live-ins in those cases.
- for (Value *U : StrideV->users()) {
- if (!isa<SExtInst, ZExtInst>(U))
- continue;
- VPValue *StrideVPV = Plan->getLiveIn(U);
- if (!StrideVPV)
- continue;
- unsigned BW = U->getType()->getScalarSizeInBits();
- APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
- : ScevStride->getAPInt().zext(BW);
- VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
- StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
- }
- }
+ // Replace VPValues for known constant strides.
+ VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
+ Legal->getLAI()->getSymbolicStrides());
auto BlockNeedsPredication = [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
@@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
OrigLoop, *LI, Legal->getWidestInductionType(),
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
VPlanTransforms::handleEarlyExits(*Plan,
- /*HasUncountableExit*/ false, Range);
+ /*HasUncountableExit*/ false);
VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
/*TailFolded*/ false);
@@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
void LoopVectorizationPlanner::attachRuntimeChecks(
VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
- if (SCEVCheckBlock) {
+ if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
assert((!CM.OptForSize ||
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
@@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
HasBranchWeights);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
- if (MemCheckBlock) {
+ if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
// VPlan-native path does not do any analysis for runtime checks
// currently.
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
@@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+ bool IsIndvarOverflowCheckNeededForVF =
+ VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+ !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
+ CM.getTailFoldingStyle() !=
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ const uint32_t *BranchWeigths =
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+ ? &MinItersBypassWeights[0]
+ : nullptr;
+ VPlanTransforms::addMinimumIterationCheck(
+ Plan, VF, UF, MinProfitableTripCount,
+ CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+ IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths,
+ OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+ *PSE.getSE());
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
+ VF.MinProfitableTripCount);
+
+ LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
}
reportVectorization(ORE, L, VF, 1);
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues(
return EndValueFromAdditionalBypass;
}
+static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+ VPlan &BestEpiPlan,
+ LoopVectorizationLegality &LVL,
+ const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount) {
+ // Fix reduction resume values by reusing the value from the bypass block.
+ BasicBlock *PH = L->getLoopPreheader();
+ for (auto *Pred : predecessors(PH)) {
+ for (PHINode &Phi : PH->phis()) {
+ if (Phi.getBasicBlockIndex(Pred) != -1)
+ continue; // Phi already has an incoming value for Pred.
+ Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
+ }
+ }
+ auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
+ if (ScalarPH->hasPredecessors()) {
+ // ScalarPH has predecessors: walk its VPlan phis zipped with the IR phis of
+ // its block, since their reduction resume values may need updating.
+ for (const auto &[R, IRPhi] :
+ zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
+ fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi,
+ BypassBlock);
+ }
+ }
+
+ // Fix induction resume values, building IR at BypassBlock's insertion point.
+ IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
+ for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ Value *V = createInductionAdditionalBypassValues(
+ IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
+ LVL.getPrimaryInduction());
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ Inc->setIncomingValueForBlock(BypassBlock, V);
+ }
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
- &Requirements, &Hints, DB, AC, BFI, PSI);
+ &Requirements, &Hints, DB, AC, BFI, PSI, AA);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
@@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
+ if (!LVL.getPotentiallyFaultingLoads().empty()) {
+ reportVectorizationFailure("Auto-vectorization of loops with potentially "
+ "faulting load is not supported",
+ "PotentiallyFaultingLoadsNotSupported", ORE, L);
+ return false;
+ }
+
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
- bool DisableRuntimeUnroll = false;
- MDNode *OrigLoopID = L->getLoopID();
- {
+ // Report the vectorization decision.
+ if (VF.Width.isScalar()) {
using namespace ore;
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop, then
- // interleave it.
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
-
- // TODO: Move to general VPlan pipeline once epilogue loops are also
- // supported.
- VPlanTransforms::runPass(
- VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
- VF.Width, IC, PSE);
-
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+ assert(IC > 1);
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+ } else {
+ // Report the vectorization decision.
+ reportVectorization(ORE, L, VF, IC);
+ }
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ checkMixedPrecision(L, ORE);
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
- } else {
- // If we decided that it is *legal* to vectorize the loop, then do it.
-
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- // Consider vectorizing the epilogue too if it's profitable.
- VectorizationFactor EpilogueVF =
- LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
- if (EpilogueVF.Width.isVector()) {
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
-
- // The first pass vectorizes the main loop and creates a scalar epilogue
- // to be vectorized by executing the plan (potentially with a different
- // factor) again shortly afterwards.
- VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
- BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
- preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
- BestEpiPlan);
- EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
- BFI, PSI, Checks, *BestMainPlan);
- auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
- *BestMainPlan, MainILV, DT, false);
- ++LoopsVectorized;
-
- // Second pass vectorizes the epilogue and adjusts the control flow
- // edges from the first pass.
- EpilogueVectorizerEpilogueLoop EpilogILV(
- L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan);
- EpilogILV.setTripCount(MainILV.getTripCount());
- preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
-
- LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT, true);
-
- // Fix induction resume values from the additional bypass block.
- BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
- IRBuilder<> BypassBuilder(BypassBlock,
- BypassBlock->getFirstInsertionPt());
- BasicBlock *PH = L->getLoopPreheader();
- for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
- auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
- Value *V = createInductionAdditionalBypassValues(
- IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
- LVL.getPrimaryInduction());
- // TODO: Directly add as extra operand to the VPResumePHI recipe.
- Inc->setIncomingValueForBlock(BypassBlock, V);
- }
- ++LoopsEpilogueVectorized;
+ // If we decided that it is *legal* to interleave or vectorize the loop, then
+ // do it.
- if (!Checks.hasChecks())
- DisableRuntimeUnroll = true;
- } else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
- VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
- Checks, BestPlan);
- // TODO: Move to general VPlan pipeline once epilogue loops are also
- // supported.
- VPlanTransforms::runPass(
- VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
- VF.Width, IC, PSE);
-
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
- ++LoopsVectorized;
-
- // Add metadata to disable runtime unrolling a scalar loop when there
- // are no runtime checks about strides and memory. A scalar loop that is
- // rarely used is not worth unrolling.
- if (!Checks.hasChecks())
- DisableRuntimeUnroll = true;
- }
- // Report the vectorization decision.
- reportVectorization(ORE, L, VF, IC);
- }
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
+ // Consider vectorizing the epilogue too if it's profitable.
+ VectorizationFactor EpilogueVF =
+ LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+ if (EpilogueVF.Width.isVector()) {
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
+
+ // The first pass vectorizes the main loop and creates a scalar epilogue
+ // to be vectorized by executing the plan (potentially with a different
+ // factor) again shortly afterwards.
+ VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
+ BestEpiPlan);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
+ PSI, Checks, *BestMainPlan);
+ auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+ *BestMainPlan, MainILV, DT, false);
+ ++LoopsVectorized;
+
+ // Second pass vectorizes the epilogue and adjusts the control flow
+ // edges from the first pass.
+ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
+ BFI, PSI, Checks, BestEpiPlan);
+ EpilogILV.setTripCount(MainILV.getTripCount());
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
+
+ LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
+ true);
+
+ fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L,
+ BestEpiPlan, LVL, ExpandedSCEVs,
+ EPI.VectorTripCount);
+ ++LoopsEpilogueVectorized;
+ } else {
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
+ Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(
+ VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width,
+ IC, PSE);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
- if (ORE->allowExtraAnalysis(LV_NAME))
- checkMixedPrecision(L, ORE);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+ ++LoopsVectorized;
}
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
+ assert(!verifyFunction(*F, &dbgs()));
- std::optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID) {
- L->setLoopID(*RemainderLoopID);
- } else {
- if (DisableRuntimeUnroll)
- addRuntimeUnrollDisableMetaData(L);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
- }
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
DB = &AM.getResult<DemandedBitsAnalysis>(F);
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
LAIs = &AM.getResult<LoopAccessAnalysis>(F);
+ AA = &AM.getResult<AAManager>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 37dc41413966..6a56dbfaa015 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper {
return false;
}
bool equal(unsigned Opcode) {
- if (Opcode == I->getOpcode())
- return trySet(MainOpBIT, MainOpBIT);
- return false;
+ return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
}
unsigned getOpcode() const {
MaskType Candidate = Mask & SeenBefore;
@@ -5576,7 +5574,23 @@ private:
if (auto *SD = dyn_cast<ScheduleData>(Data)) {
SD->setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ProcessBundleMember(SD, {});
+ SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
+ SmallVector<ScheduleBundle *> Bundles;
+ Instruction *In = SD->getInst();
+ if (R.isVectorized(In)) {
+ ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
+ for (TreeEntry *TE : Entries) {
+ if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) &&
+ In->getNumOperands() != TE->getNumOperands())
+ continue;
+ auto &BundlePtr =
+ PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
+ BundlePtr->setTreeEntry(TE);
+ BundlePtr->add(SD);
+ Bundles.push_back(BundlePtr.get());
+ }
+ }
+ ProcessBundleMember(SD, Bundles);
} else {
ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
Bundle.setScheduled(/*Scheduled=*/true);
@@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
}
/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices,
- Instruction *Inst = nullptr) {
+/// pointers for type ElemTy. If they are not, or if the stride is a constant,
+/// nullptr is returned. Otherwise, the SCEV of the stride value is returned.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices) {
SmallVector<const SCEV *> SCEVs;
const SCEV *PtrSCEVLowest = nullptr;
const SCEV *PtrSCEVHighest = nullptr;
@@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
for (Value *Ptr : PointerOps) {
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (!PtrSCEV)
- return std::nullopt;
+ return nullptr;
SCEVs.push_back(PtrSCEV);
if (!PtrSCEVLowest && !PtrSCEVHighest) {
PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
}
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Diff))
- return std::nullopt;
+ return nullptr;
if (Diff->isNonConstantNegative()) {
PtrSCEVLowest = PtrSCEV;
continue;
}
const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
if (isa<SCEVCouldNotCompute>(Diff1))
- return std::nullopt;
+ return nullptr;
if (Diff1->isNonConstantNegative()) {
PtrSCEVHighest = PtrSCEV;
continue;
@@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
// Dist = PtrSCEVHighest - PtrSCEVLowest;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Dist))
- return std::nullopt;
+ return nullptr;
int Size = DL.getTypeStoreSize(ElemTy);
auto TryGetStride = [&](const SCEV *Dist,
const SCEV *Multiplier) -> const SCEV * {
@@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
Stride = TryGetStride(Dist, Sz);
if (!Stride)
- return std::nullopt;
+ return nullptr;
}
if (!Stride || isa<SCEVConstant>(Stride))
- return std::nullopt;
+ return nullptr;
// Iterate through all pointers and check if all distances are
// unique multiple of Stride.
using DistOrdPair = std::pair<int64_t, int>;
@@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
const SCEV *Coeff = TryGetStride(Diff, Stride);
if (!Coeff)
- return std::nullopt;
+ return nullptr;
const auto *SC = dyn_cast<SCEVConstant>(Coeff);
if (!SC || isa<SCEVCouldNotCompute>(SC))
- return std::nullopt;
+ return nullptr;
if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
SE.getMulExpr(Stride, SC)))
->isZero())
- return std::nullopt;
+ return nullptr;
Dist = SC->getAPInt().getZExtValue();
}
// If the strides are not the same or repeated, we can't vectorize.
if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
- return std::nullopt;
+ return nullptr;
auto Res = Offsets.emplace(Dist, Cnt);
if (!Res.second)
- return std::nullopt;
+ return nullptr;
// Consecutive order if the inserted element is the last one.
IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
++Cnt;
}
if (Offsets.size() != SCEVs.size())
- return std::nullopt;
+ return nullptr;
SortedIndices.clear();
if (!IsConsecutive) {
// Fill SortedIndices array only if it is non-consecutive.
@@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
++Cnt;
}
}
- if (!Inst)
- return nullptr;
- SCEVExpander Expander(SE, DL, "strided-load-vec");
- return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+ return Stride;
}
static std::pair<InstructionCost, InstructionCost>
@@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() {
// it is an attempt to reorder node with reused scalars but with
// external uses.
if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
- OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ OrdersUses.try_emplace(OrdersType(), 0).first->second +=
ExternalUserReorderIndices.size();
} else {
for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
- ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+ ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
}
// No other useful reorder data in this entry.
if (Order.empty())
@@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() {
return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
} else {
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ ++OrdersUses.try_emplace(Order, 0).first->second;
}
}
if (OrdersUses.empty())
@@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
});
fixupOrderingIndices(CurrentOrder);
- OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
- NumOps;
+ OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
} else {
- OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+ OrdersUses.try_emplace(Order, 0).first->second += NumOps;
}
- auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+ auto Res = OrdersUses.try_emplace(OrdersType(), 0);
const auto AllowsReordering = [&](const TreeEntry *TE) {
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis {
}
}
}
- if (MainOp)
+ if (MainOp) {
+ // Do not match if any copyable element is a terminator from the same block
+ // as the main operation.
+ if (any_of(VL, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && I->getParent() == MainOp->getParent() &&
+ I->isTerminator();
+ })) {
+ MainOp = nullptr;
+ return;
+ }
MainOpcode = MainOp->getOpcode();
+ }
}
/// Returns the idempotent value for the \p MainOp with the detected \p
@@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
}
SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
if (all_of(VL, [&](Value *V) {
- return isa<PoisonValue>(V) || Values.contains(V);
+ return isa<PoisonValue>(V) || Values.contains(V) ||
+ (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
+ LI->getLoopFor(S.getMainOp()->getParent()) &&
+ isVectorized(V));
})) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
@@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
}
+ Value *getVectorizedValue(const TreeEntry &E) {
+ Value *Vec = E.VectorizedValue;
+ if (!Vec->getType()->isIntOrIntVectorTy())
+ return Vec;
+ return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
+ return !isa<PoisonValue>(V) &&
+ !isKnownNonNegative(
+ V, SimplifyQuery(*R.DL));
+ }));
+ }
+
public:
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
: BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
@@ -18001,35 +18030,14 @@ public:
/// Adds 2 input vectors (in form of tree entries) and the mask for their
/// shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
- Value *V1 = E1.VectorizedValue;
- if (V1->getType()->isIntOrIntVectorTy())
- V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
- Value *V2 = E2.VectorizedValue;
- if (V2->getType()->isIntOrIntVectorTy())
- V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V1 = getVectorizedValue(E1);
+ Value *V2 = getVectorizedValue(E2);
add(V1, V2, Mask);
}
/// Adds single input vector (in form of tree entry) and the mask for its
/// shuffling.
void add(const TreeEntry &E1, ArrayRef<int> Mask) {
- Value *V1 = E1.VectorizedValue;
- if (V1->getType()->isIntOrIntVectorTy())
- V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V1 = getVectorizedValue(E1);
add(V1, Mask);
}
/// Adds 2 input vectors and the mask for their shuffling.
@@ -18178,14 +18186,7 @@ public:
auto CreateSubVectors = [&](Value *Vec,
SmallVectorImpl<int> &CommonMask) {
for (auto [E, Idx] : SubVectors) {
- Value *V = E->VectorizedValue;
- if (V->getType()->isIntOrIntVectorTy())
- V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(
- V, SimplifyQuery(*R.DL));
- }));
+ Value *V = getVectorizedValue(*E);
unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
// Use scalar version of the SCalarType to correctly handle shuffles
// for revectorization. The revectorization mode operates by the
@@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return cast<LoadInst>(V)->getPointerOperand();
});
OrdersType Order;
- std::optional<Value *> Stride =
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
- &*Builder.GetInsertPoint());
+ const SCEV *StrideSCEV =
+ calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
+ assert(StrideSCEV && "At this point stride should be known");
+ SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+ Value *Stride = Expander.expandCodeFor(
+ StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
Value *NewStride =
- Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
+ Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
StrideVal = Builder.CreateMul(
NewStride,
ConstantInt::get(
@@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree(
!(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst &&
VectorizableTree.front()->isGather() &&
- is_contained(VectorizableTree.front()->Scalars, I)))
+ is_contained(VectorizableTree.front()->Scalars, I)) &&
+ !(!VectorizableTree.front()->isGather() &&
+ VectorizableTree.front()->isCopyableElement(I)))
continue;
SmallVector<SelectInst *> LogicalOpSelects;
I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
@@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
continue;
}
auto *SD = cast<ScheduleData>(SE);
+ if (SD->hasValidDependencies() &&
+ (!S.areInstructionsWithCopyableElements() ||
+ !S.isCopyableElement(SD->getInst())) &&
+ !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
+ EI.UserTE->hasState() &&
+ (!EI.UserTE->hasCopyableElements() ||
+ !EI.UserTE->isCopyableElement(SD->getInst())))
+ SD->clearDirectDependencies();
for (const Use &U : SD->getInst()->operands()) {
unsigned &NumOps =
UserOpToNumOps
@@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
if (auto *Op = dyn_cast<Instruction>(U.get());
Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
*SLP, NumOps)) {
- if (ScheduleData *OpSD = getScheduleData(Op)) {
+ if (ScheduleData *OpSD = getScheduleData(Op);
+ OpSD && OpSD->hasValidDependencies()) {
OpSD->clearDirectDependencies();
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ScheduleCopyableDataMapByUsers.erase(I);
ScheduleCopyableDataMap.erase(KV);
// Need to recalculate dependencies for the actual schedule data.
- if (ScheduleData *OpSD = getScheduleData(I)) {
+ if (ScheduleData *OpSD = getScheduleData(I);
+ OpSD && OpSD->hasValidDependencies()) {
OpSD->clearDirectDependencies();
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote(
return TryProcessInstruction(BitWidth);
case Instruction::ZExt:
case Instruction::SExt:
+ if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
+ E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
+ E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
+ return false;
IsProfitableToDemote = true;
return TryProcessInstruction(BitWidth);
@@ -23797,9 +23817,7 @@ public:
size_t Key, Idx;
std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
/*AllowAlternate=*/false);
- ++PossibleReducedVals[Key][Idx]
- .insert(std::make_pair(V, 0))
- .first->second;
+ ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
}
for (Instruction *I : reverse(PossibleReductionOps))
Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
@@ -23820,21 +23838,20 @@ public:
stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
return P1.size() > P2.size();
});
- int NewIdx = -1;
+ bool First = true;
for (ArrayRef<Value *> Data : PossibleRedValsVect) {
- if (NewIdx < 0 ||
- (!isGoodForReduction(Data) &&
- (!isa<LoadInst>(Data.front()) ||
- !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
- getUnderlyingObject(
- cast<LoadInst>(Data.front())->getPointerOperand()) !=
- getUnderlyingObject(
- cast<LoadInst>(ReducedVals[NewIdx].front())
- ->getPointerOperand())))) {
- NewIdx = ReducedVals.size();
+ if (First) {
+ First = false;
ReducedVals.emplace_back();
+ } else if (!isGoodForReduction(Data)) {
+ auto *LI = dyn_cast<LoadInst>(Data.front());
+ auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
+ if (!LI || !LastLI ||
+ getUnderlyingObject(LI->getPointerOperand()) !=
+ getUnderlyingObject(LastLI->getPointerOperand()))
+ ReducedVals.emplace_back();
}
- ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
+ ReducedVals.back().append(Data.rbegin(), Data.rend());
}
}
// Sort the reduced values by number of same/alternate opcode and/or pointer
@@ -23847,7 +23864,8 @@ public:
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC,
+ DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -24164,9 +24182,7 @@ public:
// previous vectorization attempts.
if (any_of(VL, [&V](Value *RedVal) {
auto *RedValI = dyn_cast<Instruction>(RedVal);
- if (!RedValI)
- return false;
- return V.isDeleted(RedValI);
+ return RedValI && V.isDeleted(RedValI);
}))
break;
V.buildTree(VL, IgnoreList);
@@ -24248,7 +24264,7 @@ public:
// Estimate cost.
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+ getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n");
@@ -24553,7 +24569,9 @@ private:
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
bool IsCmpSelMinMax, FastMathFlags FMF,
- const BoUpSLP &R) {
+ const BoUpSLP &R, DominatorTree &DT,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
@@ -24578,6 +24596,22 @@ private:
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ if (RdxKind == RecurKind::FAdd) {
+ InstructionCost FMACost = canConvertToFMA(
+ RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+ if (FMACost.isValid()) {
+ LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+ if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+ // Also, exclude scalar fmul cost.
+ InstructionCost FMulCost =
+ TTI->getInstructionCost(I, CostKind);
+ LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ ScalarCost += FMACost;
+ continue;
+ }
+ }
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
@@ -24642,8 +24676,45 @@ private:
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
- VectorCost +=
- TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+ InstructionCost FMACost = InstructionCost::getInvalid();
+ if (RdxKind == RecurKind::FAdd) {
+ // Check if the reduction operands can be converted to FMA.
+ SmallVector<Value *> Ops;
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *RdxVal : ReducedVals) {
+ if (!RdxVal->hasOneUse()) {
+ Ops.clear();
+ break;
+ }
+ if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+ FMF &= FPCI->getFastMathFlags();
+ Ops.push_back(RdxVal->user_back());
+ }
+ if (!Ops.empty()) {
+ FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+ *TTI, TLI);
+ if (FMACost.isValid()) {
+ // Calculate actual FMAD cost.
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+ {RVecTy, RVecTy, RVecTy}, FMF);
+ FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+ // Also, exclude vector fmul cost.
+ InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+ Instruction::FMul, RVecTy, CostKind);
+ LLVM_DEBUG(dbgs()
+ << "Minus vector FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ }
+ }
+ if (FMACost.isValid())
+ VectorCost += FMACost;
+ else
+ VectorCost +=
+ TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
if (RType != RedTy) {
unsigned Opcode = Instruction::Trunc;
if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
@@ -25540,7 +25611,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
template <typename T>
static bool tryToVectorizeSequence(
SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
- function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
bool MaxVFOnly, BoUpSLP &R) {
bool Changed = false;
@@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence(
auto *SameTypeIt = IncIt;
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
- AreCompatible(*SameTypeIt, *IncIt))) {
+ AreCompatible(VL, *SameTypeIt))) {
auto *I = dyn_cast<Instruction>(*SameTypeIt);
++SameTypeIt;
if (I && !R.isDeleted(I))
@@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
return compareCmp<false>(V, V2, *TLI, *DT);
};
- auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
- if (V1 == V2)
+ auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
+ if (VL.empty() || VL.back() == V1)
return true;
- return compareCmp<true>(V1, V2, *TLI, *DT);
+ return compareCmp<true>(V1, VL.back(), *TLI, *DT);
};
SmallVector<Value *> Vals;
@@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return false;
};
- auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
- if (V1 == V2)
+ auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
+ Value *V1) {
+ if (VL.empty() || V1 == VL.back())
return true;
+ Value *V2 = VL.back();
if (V1->getType() != V2->getType())
return false;
ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
@@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
InstSetVector PostProcessInserts;
SmallSetVector<CmpInst *, 8> PostProcessCmps;
- // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
+ // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
// also vectorizes `PostProcessCmps`.
auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
@@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getValueID();
};
- auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
+ bool SameParent = true;
+ auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
+ if (VL.empty()) {
+ SameParent = true;
+ return true;
+ }
+ StoreInst *V2 = VL.back();
if (V1 == V2)
return true;
if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
@@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (isa<UndefValue>(V1->getValueOperand()) ||
isa<UndefValue>(V2->getValueOperand()))
return true;
- if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
- if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
- if (I1->getParent() != I2->getParent())
- return false;
- return getSameOpcode({I1, I2}, *TLI).valid();
- }
if (isa<Constant>(V1->getValueOperand()) &&
isa<Constant>(V2->getValueOperand()))
return true;
+ // Check if the operands of the stores can be vectorized. They can be
+ // vectorized if they have compatible operands or have operands that can
+ // be vectorized as copyables.
+ auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
+ auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
+ if (I1 || I2) {
+ // Accept only tail-following non-compatible values for now.
+ // TODO: investigate if it is possible to vectorize incompatible values,
+ // if the copyables are first in the list.
+ if (I1 && !I2)
+ return false;
+ SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
+ SmallVector<Value *> NewVL(VL.size() + 1);
+ for (auto [SI, V] : zip(VL, NewVL))
+ V = SI->getValueOperand();
+ NewVL.back() = V1->getValueOperand();
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
+ /*SkipSameCodeCheck=*/!SameParent);
+ if (S)
+ return true;
+ if (!SameParent)
+ return false;
+ }
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f972efa07eb7..16b1b539345d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -45,6 +45,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <cassert>
#include <string>
@@ -55,6 +56,15 @@ namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
+/// @{
+/// Metadata attribute names
+const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
+const char LLVMLoopVectorizeFollowupVectorized[] =
+ "llvm.loop.vectorize.followup_vectorized";
+const char LLVMLoopVectorizeFollowupEpilogue[] =
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
extern cl::opt<unsigned> ForceTargetInstructionCost;
static cl::opt<bool> PrintVPlansInDotFormat(
@@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) {
for (unsigned i = 0; i < WorkList.size(); i++) {
T *Current = WorkList[i];
- if (Current->getNumPredecessors() == 0)
+ if (!Current->hasPredecessors())
return Current;
auto &Predecessors = Current->getPredecessors();
WorkList.insert_range(Predecessors);
@@ -216,7 +226,7 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
// If VPBB is in a region R, VPBB is a loop header if R is a loop region with
// VPBB as its entry, i.e., free of predecessors.
if (auto *R = VPBB->getParent())
- return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
+ return !R->isReplicator() && !VPBB->hasPredecessors();
// A header dominates its second predecessor (the latch), with the other
// predecessor being the preheader
@@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
void VPIRBasicBlock::execute(VPTransformState *State) {
assert(getHierarchicalSuccessors().size() <= 2 &&
"VPIRBasicBlock can have at most two successors at the moment!");
+ // Move completely disconnected blocks to their final position.
+ if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB))
+ IRBB->moveAfter(State->CFG.PrevBB);
State->Builder.SetInsertPoint(IRBB->getTerminator());
State->CFG.PrevBB = IRBB;
State->CFG.VPBB2IRBB[this] = IRBB;
@@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
const VPBlockBase *Pred = nullptr;
- if (getNumPredecessors() > 0) {
+ if (hasPredecessors()) {
Pred = getPredecessors()[Idx];
} else {
auto *Region = getParent();
@@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() {
BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
VPIRBasicBlock *NewScalarHeader = nullptr;
- if (getScalarHeader()->getNumPredecessors() == 0) {
- NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
- } else {
+ if (getScalarHeader()->hasPredecessors()) {
NewScalarHeader = cast<VPIRBasicBlock>(*find_if(
vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) {
auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB);
return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
}));
+ } else {
+ NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
}
// Create VPlan, clone live-ins and remap operands in the cloned blocks.
auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader);
@@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
// First assign the base name for V.
- const auto &[A, _] = VPValue2Name.insert({V, BaseName});
+ const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName);
// Integer or FP constants with different types will result in he same string
// due to stripping types.
if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV))
@@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
// If it is already used by C > 0 other VPValues, increase the version counter
// C and use it for V.
- const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0});
+ const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0);
if (!UseInserted) {
C->second++;
A->second = (BaseName + Twine(".") + Twine(C->second)).str();
@@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
llvm_unreachable("No plan found!");
}
+static void addRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+ if (S->getString().starts_with("llvm.loop.unroll.runtime.disable"))
+ continue;
+ IsUnrollMetadata =
+ S->getString().starts_with("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(I));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
+void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
+ Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue,
+ unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) {
+ MDNode *LID = OrigLoop->getLoopID();
+ // Update the metadata of the scalar loop. Skip the update when vectorizing
+ // the epilogue loop, to ensure it is only updated once.
+ if (!VectorizingEpilogue) {
+ std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID(
+ LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID) {
+ OrigLoop->setLoopID(*RemainderLoopID);
+ } else {
+ if (DisableRuntimeUnroll)
+ addRuntimeUnrollDisableMetaData(OrigLoop);
+
+ LoopVectorizeHints Hints(OrigLoop, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ }
+
+ if (!VectorLoop)
+ return;
+
+ if (std::optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized})) {
+ VectorLoop->setLoopID(*VectorizedLoopID);
+ } else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (LID)
+ VectorLoop->setLoopID(LID);
+
+ if (!VectorizingEpilogue) {
+ LoopVectorizeHints Hints(VectorLoop, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+
+ // Check if it's EVL-vectorized and mark the corresponding metadata.
+ bool IsEVLVectorized =
+ llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
+ // Looking for the ExplicitVectorLength VPInstruction.
+ if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
+ return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
+ return false;
+ });
+ if (IsEVLVectorized) {
+ LLVMContext &Context = VectorLoop->getHeader()->getContext();
+ MDNode *LoopID = VectorLoop->getLoopID();
+ auto *IsEVLVectorizedMD = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
+ MDString::get(Context, "evl")});
+ MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
+ {IsEVLVectorizedMD});
+ VectorLoop->setLoopID(NewLoopID);
+ }
+ }
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
+ addRuntimeUnrollDisableMetaData(VectorLoop);
+
+ // Set/update profile weights for the vector and remainder loops as original
+ // loop iterations are now distributed among them. Note that original loop
+ // becomes the scalar remainder loop after vectorization.
+ //
+ // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+ // end up getting a slightly rougher result but that should be OK since
+ // profile is not inherently precise anyway. Note also possible bypass of
+ // vector code caused by legality checks is ignored, assigning all the weight
+ // to the vector loop, optimistically.
+ //
+ // For scalable vectorization we can't know at compile time how many
+ // iterations of the loop are handled in one vector iteration, so instead
+ // use the value of vscale used for tuning.
+ setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
if (VPlans.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d6bc462a0dfa..53291a931530 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -219,6 +219,9 @@ public:
size_t getNumSuccessors() const { return Successors.size(); }
size_t getNumPredecessors() const { return Predecessors.size(); }
+ /// Returns true if this block has any predecessors.
+ bool hasPredecessors() const { return !Predecessors.empty(); }
+
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
public:
VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPDef(SC), VPUser(Operands), DL(DL) {}
virtual ~VPRecipeBase() = default;
@@ -518,11 +521,11 @@ protected:
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(SC, Operands, DL), VPValue(this) {}
VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
- Value *UV, DebugLoc DL = {})
+ Value *UV, DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}
static inline bool classof(const VPRecipeBase *R) {
@@ -557,6 +560,7 @@ public:
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
+ case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenLoadEVLSC:
@@ -712,12 +716,15 @@ public:
VPIRFlags(GEPNoWrapFlags GEPFlags)
: OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {}
-public:
void transferFlags(VPIRFlags &Other) {
OpType = Other.OpType;
AllFlags = Other.AllFlags;
}
+ /// Only keep flags also present in \p Other. \p Other must have the same
+ /// OpType as the current object.
+ void intersectFlags(const VPIRFlags &Other);
+
/// Drop all poison-generating flags.
void dropPoisonGeneratingFlags() {
// NOTE: This needs to be kept in-sync with
@@ -864,7 +871,7 @@ public:
/// using IR flags.
struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {}
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
@@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
: VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {}
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL = {})
+ const VPIRFlags &Flags,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {}
static inline bool classof(const VPRecipeBase *R) {
@@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
return R && classof(R);
}
+ static inline bool classof(const VPSingleDefRecipe *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
+
void execute(VPTransformState &State) override = 0;
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
@@ -975,6 +988,10 @@ public:
Not,
SLPLoad,
SLPStore,
+ // Creates a mask where each lane is active (true) whilst the current
+ // counter (first operand + index) is less than the second operand. i.e.
+ // mask[i] = icmp ult (op0 + i), op1
+ // The size of the mask returned is VF * Multiplier (UF, third op).
ActiveLaneMask,
ExplicitVectorLength,
CalculateTripCountMinusVF,
@@ -1014,7 +1031,8 @@ public:
// Returns a scalar boolean value, which is true if any lane of its
// (boolean) vector operands is true. It produces the reduced value across
// all unrolled iterations. Unrolling will add all copies of its original
- // operand as additional operands.
+ // operand as additional operands. AnyOf is poison-safe as all operands
+ // will be frozen.
AnyOf,
// Calculates the first active lane index of the vector predicate operands.
// It produces the lane index across all unrolled iterations. Unrolling will
@@ -1080,13 +1098,13 @@ private:
#endif
public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
- const Twine &Name = "")
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
- const VPIRFlags &Flags, DebugLoc DL = {},
+ const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "");
VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
@@ -1479,7 +1497,8 @@ public:
}
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
- const VPIRFlags &Flags = {}, DebugLoc DL = {})
+ const VPIRFlags &Flags = {},
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL),
VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) {
assert(flagsValidForOpcode(Opcode) &&
@@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
public:
VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
MayReadFromMemory(CI.mayReadFromMemory()),
@@ -1546,7 +1565,7 @@ public:
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
@@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
public:
VPWidenCallRecipe(Value *UV, Function *Variant,
- ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
+ ArrayRef<VPValue *> CallArguments,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
*cast<Instruction>(UV)),
VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) {
@@ -1644,10 +1664,8 @@ public:
return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
}
- operand_range args() { return make_range(op_begin(), std::prev(op_end())); }
- const_operand_range args() const {
- return make_range(op_begin(), std::prev(op_end()));
- }
+ operand_range args() { return drop_end(operands()); }
+ const_operand_range args() const { return drop_end(operands()); }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
@@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase {
public:
VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}
~VPHistogramRecipe() override = default;
@@ -1998,6 +2016,9 @@ public:
return getOperand(1);
}
+ /// Update the incoming value from the loop backedge.
+ void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
/// Returns the backedge value as a recipe. The backedge value is guaranteed
/// to be a recipe.
virtual VPRecipeBase &getBackedgeRecipe() {
@@ -2229,8 +2250,8 @@ protected:
public:
/// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and
/// debug location \p DL.
- VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {},
- const Twine &Name = "")
+ VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr,
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
: VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL),
Name(Name.str()) {
if (Start)
@@ -2381,9 +2402,8 @@ public:
}
VPBlendRecipe *clone() override {
- SmallVector<VPValue *> Ops(operands());
- return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops,
- getDebugLoc());
+ return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()),
+ operands(), getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPBlendSC)
@@ -2409,6 +2429,12 @@ public:
return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
}
+ /// Set mask number \p Idx to \p V.
+ void setMask(unsigned Idx, VPValue *V) {
+ assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
+ Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V);
+ }
+
void execute(VPTransformState &State) override {
llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
}
@@ -2434,12 +2460,13 @@ public:
}
};
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
- public VPIRMetadata {
+/// A common base class for interleaved memory operations.
+/// An interleaved memory operation is a memory access method that combines
+/// multiple strided loads/stores into a single wide load/store with shuffles.
+/// The first operand is the start address. The optional operands are, in order,
+/// the stored values and the mask.
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
+ public VPIRMetadata {
const InterleaveGroup<Instruction> *IG;
/// Indicates if the interleave group is in a conditional block and requires a
@@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
/// unusued gaps can be loaded speculatively.
bool NeedsMaskForGaps = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
- ArrayRef<VPValue *> StoredValues, VPValue *Mask,
- bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
- : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD),
- IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+protected:
+ VPInterleaveBase(const unsigned char SC,
+ const InterleaveGroup<Instruction> *IG,
+ ArrayRef<VPValue *> Operands,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG),
+ NeedsMaskForGaps(NeedsMaskForGaps) {
// TODO: extend the masked interleaved-group support to reversed access.
assert((!Mask || !IG->isReverse()) &&
"Reversed masked interleave-group not supported.");
@@ -2473,14 +2502,19 @@ public:
addOperand(Mask);
}
}
- ~VPInterleaveRecipe() override = default;
- VPInterleaveRecipe *clone() override {
- return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
- NeedsMaskForGaps, *this, getDebugLoc());
+public:
+ VPInterleaveBase *clone() override = 0;
+
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+ R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
}
- VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Return the address accessed by this recipe.
VPValue *getAddr() const {
@@ -2490,48 +2524,130 @@ public:
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ // Mask is optional and the last operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
}
+ /// Return true if the access needs a mask because of the gaps.
+ bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; }
+
+ Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("VPInterleaveBase should not be instantiated.");
+ }
+
+ /// Return the cost of this recipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+
+ /// Returns the number of stored operands of this interleave group. Returns 0
+ /// for load interleave groups.
+ virtual unsigned getNumStoreOperands() const = 0;
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
ArrayRef<VPValue *> getStoredValues() const {
- // The first operand is the address, followed by the stored values, followed
- // by an optional mask.
- return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumStoreOperands());
+ return ArrayRef<VPValue *>(op_end() -
+ (getNumStoreOperands() + (HasMask ? 1 : 0)),
+ getNumStoreOperands());
+ }
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+ ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+ bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+ : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask,
+ NeedsMaskForGaps, MD, DL) {}
+
+ ~VPInterleaveRecipe() override = default;
+
+ VPInterleaveRecipe *clone() override {
+ return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(),
+ getStoredValues(), getMask(),
+ needsMaskForGaps(), *this, getDebugLoc());
}
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
/// Generate the wide load or store, and shuffles.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPInterleaveRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ }
- /// Returns the number of stored operands of this interleave group. Returns 0
- /// for load interleave groups.
- unsigned getNumStoreOperands() const {
- return getNumOperands() - (HasMask ? 2 : 1);
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (getMask() ? 2 : 1);
}
+};
+
+/// A recipe for interleaved memory operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length. Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+ VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
+ : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+ ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+ R.getStoredValues(), Mask, R.needsMaskForGaps(), R,
+ R.getDebugLoc()) {
+ assert(!getInterleaveGroup()->isReverse() &&
+ "Reversed interleave-group with tail folding is not supported.");
+ assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
+ "supported for scalable vector.");
+ }
+
+ ~VPInterleaveEVLRecipe() override = default;
+
+ VPInterleaveEVLRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+ /// The VPValue of the explicit vector length.
+ VPValue *getEVL() const { return getOperand(1); }
- /// The recipe only uses the first lane of the address.
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// The recipe only uses the first lane of the address and the EVL operand.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+ return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+ Op == getEVL();
}
- Instruction *getInsertPos() const { return IG->getInsertPos(); }
+ unsigned getNumStoreOperands() const override {
+ return getNumOperands() - (getMask() ? 3 : 2);
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2561,14 +2677,14 @@ protected:
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
IsOrdered, DL) {}
@@ -2686,7 +2802,7 @@ public:
class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
public:
VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp,
- DebugLoc DL = {})
+ DebugLoc DL = DebugLoc::getUnknown())
: VPReductionRecipe(
VPDef::VPReductionEVLSC, R.getRecurrenceKind(),
R.getFastMathFlags(),
@@ -3537,7 +3653,8 @@ public:
InductionOpcode(Opcode) {}
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
- VPValue *Step, VPValue *VF, DebugLoc DL = {})
+ VPValue *Step, VPValue *VF,
+ DebugLoc DL = DebugLoc::getUnknown())
: VPScalarIVStepsRecipe(
IV, Step, VF, IndDesc.getInductionOpcode(),
dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
@@ -4142,7 +4259,7 @@ public:
/// Returns an iterator range over all VFs of the plan.
iterator_range<SmallSetVector<ElementCount, 2>::iterator>
vectorFactors() const {
- return {VFs.begin(), VFs.end()};
+ return VFs;
}
bool hasScalarVFOnly() const {
@@ -4299,9 +4416,8 @@ public:
/// via the other early exit).
bool hasEarlyExit() const {
return count_if(ExitBlocks,
- [](VPIRBasicBlock *EB) {
- return EB->getNumPredecessors() != 0;
- }) > 1 ||
+ [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) >
+ 1 ||
(ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
}
@@ -4309,7 +4425,7 @@ public:
/// that this relies on unneeded branches to the scalar tail loop being
/// removed.
bool hasScalarTail() const {
- return !(getScalarPreheader()->getNumPredecessors() == 0 ||
+ return !(!getScalarPreheader()->hasPredecessors() ||
getScalarPreheader()->getSinglePredecessor() == getEntry());
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22..d400ceff7797 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
- .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+ .Case<VPInterleaveBase>([V](const auto *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 80b48de57b40..cef91c15dd87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+ // Don't emit recipes for unconditional switch instructions.
+ if (SI->getNumCases() == 0)
+ continue;
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
for (auto Case : SI->cases())
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
@@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
}
void VPlanTransforms::handleEarlyExits(VPlan &Plan,
- bool HasUncountableEarlyExit,
- VFRange &Range) {
+ bool HasUncountableEarlyExit) {
auto *MiddleVPBB = cast<VPBasicBlock>(
Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
@@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
- Range);
+ cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
@@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
+/// Add the minimum-iteration check to the entry block of \p Plan: a
+/// BranchOnCond whose condition is true when the vector loop must be bypassed.
+///
+/// The threshold ("step") is VF * UF, or umax(MinProfitableTripCount, VF * UF)
+/// when the known-minimum value of \p MinProfitableTripCount is larger than
+/// that of VF * UF.
+/// - Without tail folding the condition is `trip-count CmpPred step`, where
+///   CmpPred is ULE if \p RequiresScalarEpilogue and ULT otherwise. It is
+///   replaced by Plan.getTrue() / Plan.getFalse() when ScalarEvolution (after
+///   applying the loop guards of \p OrigLoop) can prove the outcome.
+/// - With tail folding the condition stays false unless
+///   \p CheckNeededWithTailFolding is set, in which case it becomes
+///   `(UMax - trip-count) <u step`.
+/// If \p MinItersBypassWeights is non-null, two weights are read from it and
+/// attached to the branch as !prof metadata. \p DL is used for the emitted
+/// compare and branch.
+void VPlanTransforms::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ CmpInst::Predicate CmpPred =
+ RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ // If tail is to be folded, vector loop takes care of all iterations.
+ VPValue *TripCountVPV = Plan.getTripCount();
+ const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE);
+ Type *TripCountTy = TripCount->getType();
+ auto GetMinTripCount = [&]() -> const SCEV * {
+ // Compute max(MinProfitableTripCount, UF * VF) and return it.
+ const SCEV *VFxUF =
+ SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW);
+ // Shortcut: compare the known-minimum element counts directly instead of
+ // building a umax expression.
+ if (UF * VF.getKnownMinValue() >=
+ MinProfitableTripCount.getKnownMinValue()) {
+ // TODO: SCEV should be able to simplify test.
+ return VFxUF;
+ }
+ const SCEV *MinProfitableTripCountSCEV =
+ SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW);
+ return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF);
+ };
+
+ VPBasicBlock *EntryVPBB = Plan.getEntry();
+ VPBuilder Builder(EntryVPBB);
+ // Default to "never bypass"; overwritten below when a check is required.
+ VPValue *TripCountCheck = Plan.getFalse();
+ const SCEV *Step = GetMinTripCount();
+ if (TailFolded) {
+ if (CheckNeededWithTailFolding) {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+
+ // Get the maximum unsigned value for the type.
+ VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
+ TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *DistanceToMax = Builder.createNaryOp(
+ Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
+ DebugLoc::getUnknown());
+
+ // Don't execute the vector loop if (UMax - n) < (VF * UF).
+ // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF,
+ // minProfitableTripCount).
+ TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
+ Builder.createExpandSCEV(Step), DL);
+ } else {
+ // TripCountCheck = false, folding tail implies positive vector trip
+ // count.
+ }
+ } else {
+ // TODO: Emit unconditional branch to vector preheader instead of
+ // conditional branch with known condition.
+ // Note: only the SCEV used for the isKnownPredicate queries is refined with
+ // loop guards; the emitted compare still uses TripCountVPV.
+ TripCount = SE.applyLoopGuards(TripCount, OrigLoop);
+ // Check if the trip count is < the step.
+ if (SE.isKnownPredicate(CmpPred, TripCount, Step)) {
+ // TODO: Ensure step is at most the trip count when determining max VF and
+ // UF, w/o tail folding.
+ TripCountCheck = Plan.getTrue();
+ } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred),
+ TripCount, Step)) {
+ // Generate the minimum iteration check only if we cannot prove the
+ // check is known to be true, or known to be false.
+ VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step);
+ TripCountCheck = Builder.createICmp(
+ CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check");
+ } // else step known to be < trip count, use TripCountCheck preset to false.
+ }
+ // Emit the conditional branch on the computed check.
+ VPInstruction *Term =
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL);
+ // Attach branch weights only when the caller supplied them; exactly two
+ // values are read from the pointer.
+ if (MinItersBypassWeights) {
+ MDBuilder MDB(Plan.getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
+ Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ }
+}
+
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 1ec6ae677374..109156c1469c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() {
return int_pred_ty<is_all_ones>();
}
+/// Predicate for use with int_pred_ty: accepts an APInt only if it is zero.
+struct is_zero_int {
+  bool isValue(const APInt &C) const {
+    return C.isZero();
+  }
+};
+
+/// Match an integer constant equal to 0.
+/// NOTE(review): how vector (splat) constants are treated is decided by
+/// int_pred_ty, which is not shown here -- confirm before relying on it.
+inline int_pred_ty<is_zero_int> m_ZeroInt() {
+  int_pred_ty<is_zero_int> ZeroMatcher;
+  return ZeroMatcher;
+}
+
/// Matching combinators
template <typename LTy, typename RTy> struct match_combine_or {
LTy L;
@@ -218,9 +228,12 @@ struct Recipe_match {
if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...))
return false;
- assert(R->getNumOperands() == std::tuple_size<Ops_t>::value &&
- "recipe with matched opcode does not have the expected number of "
- "operands");
+ if (R->getNumOperands() != std::tuple_size<Ops_t>::value) {
+ assert(Opcode == Instruction::PHI &&
+ "non-variadic recipe with matched opcode does not have the "
+ "expected number of operands");
+ return false;
+ }
auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>();
if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) {
@@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) {
}
+/// Match a VPInstruction with opcode VPInstruction::ExplicitVectorLength whose
+/// single operand is matched by \p Op0.
template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t>
+m_EVL(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0);
+}
+
+template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}
-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
- return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+
+/// Match a VPInstruction with opcode VPInstruction::ActiveLaneMask and exactly
+/// three operands, matched in order by \p Op0, \p Op1 and \p Op2.
+/// NOTE(review): the third operand appears to be the constant VF multiplier
+/// read by VPInstruction::generate/computeCost -- confirm against VPlan.h.
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
+}
template <typename Op0_t, typename Op1_t>
@@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) {
return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
}
+/// Match either a ZExt whose operand is matched by \p Op0, or a value that is
+/// matched by \p Op0 directly (the ZExt alternative is tried first).
+template <typename Op0_t>
+inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t>
+m_ZExtOrSelf(const Op0_t &Op0) {
+ return m_CombineOr(m_ZExt(Op0), Op0);
+}
+
template <unsigned Opcode, typename Op0_t, typename Op1_t>
inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0,
const Op1_t &Op1) {
@@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}
+/// Match a binary AND operation commutatively: this forwards to m_c_Binary, so
+/// \p Op0 and \p Op1 may match the operands in either order.
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t>
+m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1);
+}
+
/// Match a binary OR operation. Note that while conceptually the operands can
/// be matched commutatively, \p Commutative defaults to false in line with the
/// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index cdadc33e3088..0c27d535b680 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,11 +14,13 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
using namespace llvm;
+using namespace VPlanPatternMatch;
namespace {
class VPPredicator {
@@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
"Distinct incoming values with one having a full mask");
break;
}
+
OperandsWithMask.push_back(EdgeMask);
}
PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c4fdcccc6d62..bf5148954309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
- return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
@@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return false;
}
default:
+ // FIXME: Return false if the recipe represents an interleaved store.
return true;
}
}
@@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
- else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+ else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
@@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+/// Intersect the IR flags of this object with those of \p Other, so that a
+/// flag stays set only if it is set on both. Both objects must have the same
+/// OpType. For OpTypes whose payload is not a set of droppable flags (Cmp,
+/// Other) nothing can be dropped, so the payloads are asserted to be equal.
+void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
+  assert(OpType == Other.OpType && "OpType must match");
+  switch (OpType) {
+  case OperationType::OverflowingBinOp:
+    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
+    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
+    break;
+  case OperationType::Trunc:
+    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
+    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
+    break;
+  case OperationType::DisjointOp:
+    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
+    break;
+  case OperationType::PossiblyExactOp:
+    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
+    break;
+  case OperationType::GEPOp:
+    GEPFlags &= Other.GEPFlags;
+    break;
+  case OperationType::FPMathOp:
+    // Intersect every fast-math flag, not only nnan/ninf. Keeping e.g.
+    // reassoc/nsz/contract from one side when the other side lacks it would
+    // let the merged recipe be computed less strictly than one of the values
+    // it stands for. This mirrors how Instruction::andIRFlags intersects the
+    // full FastMathFlags set.
+    FMFs.AllowReassoc &= Other.FMFs.AllowReassoc;
+    FMFs.NoNaNs &= Other.FMFs.NoNaNs;
+    FMFs.NoInfs &= Other.FMFs.NoInfs;
+    FMFs.NoSignedZeros &= Other.FMFs.NoSignedZeros;
+    FMFs.AllowReciprocal &= Other.FMFs.AllowReciprocal;
+    FMFs.AllowContract &= Other.FMFs.AllowContract;
+    FMFs.ApproxFunc &= Other.FMFs.ApproxFunc;
+    break;
+  case OperationType::NonNegOp:
+    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
+    break;
+  case OperationType::Cmp:
+    assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
+    break;
+  case OperationType::Other:
+    assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
+    break;
+  }
+}
+
FastMathFlags VPIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Store:
- case VPInstruction::ActiveLaneMask:
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
+ case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
return 3;
@@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
Name);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
+ auto PredTy = VectorType::get(
+ Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+ ->getZExtValue());
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
@@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::AnyOf: {
- Value *Res = State.get(getOperand(0));
+ Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
for (VPValue *Op : drop_begin(operands()))
- Res = Builder.CreateOr(Res, State.get(Op));
+ Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
case VPInstruction::ExtractLane: {
@@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
unsigned LastOpIdx = getNumOperands() - 1;
Value *Res = nullptr;
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
- Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
- Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+ Value *TrailingZeros =
+ State.VF.isScalar()
+ ? Builder.CreateZExt(
+ Builder.CreateICmpEQ(State.get(getOperand(Idx)),
+ Builder.getFalse()),
+ Builder.getInt64Ty())
+ : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
+ State.get(getOperand(Idx)),
+ true, Name);
Value *Current = Builder.CreateAdd(
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
if (Res) {
@@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case Instruction::Select: {
+ // TODO: It may be possible to improve this by analyzing where the
+ // condition operand comes from.
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+ auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+ if (!vputils::onlyFirstLaneUsed(this)) {
+ CondTy = toVectorTy(CondTy, VF);
+ VecTy = toVectorTy(VecTy, VF);
+ }
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+ Ctx.CostKind);
+ }
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
+ if (VF.isScalar()) {
+ // ExtractLane with VF=1 takes care of handling extracting across multiple
+ // parts.
+ return 0;
+ }
+
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+ if (VF.isScalar())
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy),
+ CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index.
- auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
@@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::ActiveLaneMask: {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
- Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ unsigned Multiplier =
+ cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+ Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
@@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.set(this, V);
}
-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+ ArrayRef<const VPValue *> Operands,
+ const VPRecipeWithIRFlags &R,
+ ElementCount VF,
+ VPCostContext &Ctx) {
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
- for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (const auto &[Idx, Op] : enumerate(Operands)) {
auto *V = Op->getUnderlyingValue();
if (!V) {
- if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+ if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
Arguments.push_back(UI->getArgOperand(Idx));
continue;
}
@@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+ Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
- for (unsigned I = 0; I != getNumOperands(); ++I)
- ParamTys.push_back(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ for (const VPValue *Op : Operands) {
+ ParamTys.push_back(VF.isVector()
+ ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+ : Ctx.Types.inferScalarType(Op));
+ }
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
- FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+ FastMathFlags FMF =
+ R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
- VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ ID, RetTy, Arguments, ParamTys, FMF,
+ dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
+/// Cost of this widened intrinsic call at \p VF: all operands of the recipe are
+/// passed as the intrinsic's arguments to getCostForIntrinsics.
+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+                                                    VPCostContext &Ctx) const {
+  SmallVector<const VPValue *> IntrinsicArgs(operands());
+  return getCostForIntrinsics(VectorIntrinsicID, IntrinsicArgs, *this, VF, Ctx);
+}
+
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
@@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // More complex computation, let the legacy cost-model handle this for now.
- return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+ // If the div/rem operation isn't safe to speculate and requires
+ // predication, then the only way we can even create a vplan is to insert
+ // a select on the second input operand to ensure we use the value of 1
+ // for the inactive lanes. The select will be costed separately.
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (VF.isScalar())
return TTI::CastContextHint::Normal;
- if (isa<VPInterleaveRecipe>(R))
+ if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
assert(RedTy->isIntegerTy() &&
"VPExpressionRecipe only supports integer types currently.");
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(
- cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
return Ctx.TTI.getExtendedReductionCost(
Opcode,
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case ExpressionTypes::MulAccReduction:
- return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+ return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+ Ctx.CostKind);
case ExpressionTypes::ExtMulAccReduction:
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
- RedTy, SrcVecTy, Ctx.CostKind);
+ Opcode, RedTy, SrcVecTy, Ctx.CostKind);
}
llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
@@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// instruction cost.
return 0;
case Instruction::Call: {
- if (!isSingleScalar()) {
- // TODO: Handle remaining call costs here as well.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
- break;
- }
-
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
- if (CalledFn->isIntrinsic())
- break;
+ SmallVector<const VPValue *> ArgOps(drop_end(operands()));
SmallVector<Type *, 4> Tys;
- for (VPValue *ArgOp : drop_end(operands()))
+ for (const VPValue *ArgOp : ArgOps)
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+ if (CalledFn->isIntrinsic())
+ // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+ // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+ switch (CalledFn->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::sideeffect:
+ case Intrinsic::pseudoprobe:
+ case Intrinsic::experimental_noalias_scope_decl: {
+ assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx) == 0 &&
+ "scalarizing intrinsic should be free");
+ return InstructionCost(0);
+ }
+ default:
+ break;
+ }
+
Type *ResultTy = Ctx.Types.inferScalarType(this);
- return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ InstructionCost ScalarCallCost =
+ Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ if (isSingleScalar()) {
+ if (CalledFn->isIntrinsic())
+ ScalarCallCost = std::min(
+ ScalarCallCost,
+ getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx));
+ return ScalarCallCost;
+ }
+
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ // Compute the cost of scalarizing the result and operands if needed.
+ InstructionCost ScalarizationCost = 0;
+ if (VF.isVector()) {
+ if (!ResultTy->isVoidTy()) {
+ for (Type *VectorTy :
+ to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+ ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert=*/true,
+ /*Extract=*/false, Ctx.CostKind);
+ }
+ }
+ // Skip operands that do not require extraction/scalarization and do not
+ // incur any overhead.
+ SmallPtrSet<const VPValue *, 4> UniqueOperands;
+ Tys.clear();
+ for (auto *Op : ArgOps) {
+ if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ !UniqueOperands.insert(Op).second)
+ continue;
+ Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+ }
+ ScalarizationCost +=
+ Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+ }
+
+ return ScalarCallCost * VF.getFixedValue() + ScalarizationCost;
}
case Instruction::Add:
case Instruction::Sub:
@@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
(isSingleScalar() ? 1 : VF.getFixedValue());
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (isSingleScalar()) {
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+ return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+ }
+ // TODO: See getMemInstScalarizationCost for how to handle replicating and
+ // predicated cases.
+ break;
}
}
@@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- Type *PtrTy = toVectorTy(Ptr->getType(), VF);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
+
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ Type *PtrTy = Ptr->getType();
+
+ // If the address value is uniform across all lanes, then the address can be
+ // calculated with scalar type and broadcast.
+ if (!vputils::isSingleScalar(getAddr()))
+ PtrTy = toVectorTy(PtrTy, VF);
+
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
Ctx.CostKind) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
@@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
- assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+ assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
"Masking gaps for scalable vectors is not yet supported.");
- const InterleaveGroup<Instruction> *Group = IG;
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
@@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
+ if (needsMaskForGaps()) {
MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
@@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
- assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
@@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
+ assert(!State.Lane && "Interleave group being replicated.");
+ assert(State.VF.isScalable() &&
+ "Only support scalable VF for EVL tail-folding.");
+ assert(!needsMaskForGaps() &&
+ "Masking gaps for scalable vectors is not yet supported.");
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
+ Instruction *Instr = Group->getInsertPos();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getLoadStoreType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ assert(InterleaveFactor <= 8 &&
+ "Unsupported deinterleave/interleave factor for scalable vectors");
+ ElementCount WideVF = State.VF * InterleaveFactor;
+ auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+ VPValue *Addr = getAddr();
+ Value *ResAddr = State.get(Addr, VPLane(0));
+ Value *EVL = State.get(getEVL(), VPLane(0));
+ Value *InterleaveEVL = State.Builder.CreateMul(
+ EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
+ /* NUW= */ true, /* NSW= */ true);
+ LLVMContext &Ctx = State.Builder.getContext();
+
+ Value *GroupMask = nullptr;
+ if (VPValue *BlockInMask = getMask()) {
+ SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
+ GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ } else {
+ GroupMask =
+ State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+ }
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(
+ VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
+ "wide.vp.load");
+ NewLoad->addParamAttr(0,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewLoad);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewLoad);
+
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ NewLoad = State.Builder.CreateIntrinsic(
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+ // If this member has a different type, cast the result to that type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+ StridedVec =
+ createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+ }
+
+ State.set(getVPValue(J), StridedVec);
+ ++J;
+ }
+ return;
+ } // End for interleaved load.
+
+ // The sub vector type for current instruction.
+ auto *SubVT = VectorType::get(ScalarTy, State.VF);
+ // Vectorize the interleaved store group.
+ ArrayRef<VPValue *> StoredValues = getStoredValues();
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member) {
+ StoredVecs.push_back(PoisonValue::get(SubVT));
+ continue;
+ }
+
+ Value *StoredVec = State.get(StoredValues[StoredIdx]);
+ // If this member has a different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ ++StoredIdx;
+ }
+
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+ CallInst *NewStore =
+ State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, InterleaveEVL});
+ NewStore->addParamAttr(1,
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewStore);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewStore);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (VPValue *Mask = getMask()) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " vp.store ";
+ getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = vp.load from index " << i;
+ }
+ ++OpIdx;
+ }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
// Find the VPValue index of the interleave group. We need to skip gaps.
unsigned InsertPosIdx = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e0bf241c73fd..2cac5557daee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
@@ -39,6 +40,10 @@
using namespace llvm;
using namespace VPlanPatternMatch;
+cl::opt<bool> EnableWideActiveLaneMask(
+ "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+ cl::desc("Enable use of wide get active lane mask instructions"));
+
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
@@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
for (VPValue *Op : Recipe.operands())
if (auto *Def =
dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- WorkList.insert(std::make_pair(VPBB, Def));
+ WorkList.insert({VPBB, Def});
}
}
@@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
for (VPValue *Op : SinkCandidate->operands())
if (auto *Def =
dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- WorkList.insert(std::make_pair(SinkTo, Def));
+ WorkList.insert({SinkTo, Def});
Changed = true;
}
return Changed;
@@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
auto *BlockInMask = PredRecipe->getMask();
auto *MaskDef = BlockInMask->getDefiningRecipe();
auto *BOMRecipe = new VPBranchOnMaskRecipe(
- BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
+ BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
auto *Entry =
Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
@@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser(
Type *StepTy = TypeInfo.inferScalarType(Step);
auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
return B.createPtrAdd(EndValue,
- B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
- "ind.escape");
+ B.createNaryOp(Instruction::Sub, {Zero, Step}),
+ DebugLoc::getUnknown(), "ind.escape");
}
if (ScalarTy->isFloatingPointTy()) {
const auto &ID = WideIV->getInductionDescriptor();
@@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
if (!ExpR)
continue;
- auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
- if (I.second)
+ const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
+ if (Inserted)
continue;
- ExpR->replaceAllUsesWith(I.first->second);
+ ExpR->replaceAllUsesWith(V->second);
ExpR->eraseFromParent();
}
}
@@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
// && (Y || Z) and (X || !X) into true. This requires queuing newly created
// recipes to be visited during simplification.
- VPValue *X, *Y;
+ VPValue *X, *Y, *Z;
if (match(Def,
m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
@@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- // OR x, 1 -> 1.
- if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
- Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
- : Def->getOperand(0));
- Def->eraseFromParent();
- return;
- }
+ // x | 1 -> 1
+ if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
+ return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+ // x | 0 -> x
+ if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
+ return Def->replaceAllUsesWith(X);
+
+ // x & 0 -> 0
+ if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
+ return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+ // x && false -> false
+ if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
+ return Def->replaceAllUsesWith(Def->getOperand(1));
+
+ // (x && y) || (x && z) -> x && (y || z)
+ VPBuilder Builder(Def);
+ if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
+ // Simplify only if one of the operands has one use to avoid creating an
+ // extra recipe.
+ (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
+ !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
+ return Def->replaceAllUsesWith(
+ Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
+
+ // x && !x -> 0
+ if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
+ return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
+ ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def))));
if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
return Def->replaceAllUsesWith(X);
@@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
+ // tail folding it is likely that x is a header mask and can be simplified
+ // further.
+ if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+ m_VPValue(Z))) &&
+ X->hasMoreThanOneUniqueUser())
+ return Def->replaceAllUsesWith(
+ Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
+
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
@@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
m_VPValue(X), m_SpecificInt(1)))) {
Type *WideStepTy = TypeInfo.inferScalarType(Def);
if (TypeInfo.inferScalarType(X) != WideStepTy)
- X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
+ X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
Def->replaceAllUsesWith(X);
return;
}
@@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- VPInstruction *OpVPI;
- if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&
- OpVPI->isVectorToScalar()) {
- Def->replaceAllUsesWith(OpVPI);
- return;
+ if (match(Def,
+ m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
+ vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
+ return U->usesScalars(A) || Def == U;
+ })) {
+ return Def->replaceAllUsesWith(A);
}
}
@@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
continue;
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+ if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+ vputils::isSingleScalar(RepR->getOperand(1))) {
+ auto *Clone = new VPReplicateRecipe(
+ RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+ true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+ Clone->insertBefore(RepOrWidenR);
+ auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
+ {Clone->getOperand(0)});
+ Ext->insertBefore(Clone);
+ Clone->setOperand(0, Ext);
+ RepR->eraseFromParent();
+ continue;
+ }
+
// Skip recipes that aren't single scalars or don't have only their
// scalar results used. In the latter case, we would introduce extra
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
- !vputils::onlyScalarValuesUsed(RepOrWidenR))
+ !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
+ return U->usesScalars(RepOrWidenR) ||
+ match(cast<VPRecipeBase>(U),
+ m_ExtractLastElement(m_VPValue()));
+ }))
continue;
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
@@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
}
}
+/// Try to see if all of \p Blend's masks share a common value logically and'ed
+/// and remove it from the masks.
+static void removeCommonBlendMask(VPBlendRecipe *Blend) {
+ if (Blend->isNormalized())
+ return;
+ VPValue *CommonEdgeMask;
+ if (!match(Blend->getMask(0),
+ m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
+ return;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+ if (!match(Blend->getMask(I),
+ m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
+ return;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+ Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
+}
+
/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
@@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) {
if (!Blend)
continue;
+ removeCommonBlendMask(Blend);
+
// Try to remove redundant blend recipes.
SmallPtrSet<VPValue *, 4> UniqueValues;
if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
@@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
}
+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has its last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+ unsigned UF) {
+ if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+ return false;
+
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+ auto *Term = &ExitingVPBB->back();
+
+ using namespace llvm::VPlanPatternMatch;
+ if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+ m_VPValue(), m_VPValue(), m_VPValue())))))
+ return false;
+
+ auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+ LLVMContext &Ctx = Plan.getContext();
+
+ auto ExtractFromALM = [&](VPInstruction *ALM,
+ SmallVectorImpl<VPValue *> &Extracts) {
+ DebugLoc DL = ALM->getDebugLoc();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<VPValue *> Ops;
+ Ops.append({ALM, Plan.getOrAddLiveIn(
+ ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+ VF.getKnownMinValue() * Part))});
+ auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+ IntegerType::getInt1Ty(Ctx), DL);
+ Extracts[Part] = Ext;
+ Ext->insertAfter(ALM);
+ }
+ };
+
+ // Create a list of each active lane mask phi, ordered by unroll part.
+ SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+ for (VPRecipeBase &R : Header->phis()) {
+ auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+ if (!Phi)
+ continue;
+ VPValue *Index = nullptr;
+ match(Phi->getBackedgeValue(),
+ m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+ assert(Index && "Expected index from ActiveLaneMask instruction");
+
+ auto *II = dyn_cast<VPInstruction>(Index);
+ if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+ auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+ Phis[Part->getZExtValue()] = Phi;
+ } else
+ // Anything other than a CanonicalIVIncrementForPart is part 0
+ Phis[0] = Phi;
+ }
+
+ assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+ "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+ auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+ auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+ assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+ LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+ "Expected incoming values of Phi to be ActiveLaneMasks");
+
+ // When using wide lane masks, the get.active.lane.mask intrinsic returns a
+ // mask of VF x UF lanes; UF is passed as the last operand (the multiplier).
+ VPValue *ALMMultiplier =
+ Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ EntryALM->setOperand(2, ALMMultiplier);
+ LoopALM->setOperand(2, ALMMultiplier);
+
+ // Create UF x extract vectors and insert into preheader.
+ SmallVector<VPValue *> EntryExtracts(UF);
+ ExtractFromALM(EntryALM, EntryExtracts);
+
+ // Create UF x extract vectors and insert before the loop compare & branch,
+ // updating the compare to use the first extract.
+ SmallVector<VPValue *> LoopExtracts(UF);
+ ExtractFromALM(LoopALM, LoopExtracts);
+ VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+ Not->setOperand(0, LoopExtracts[0]);
+
+ // Update the incoming values of active lane mask phis.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Phis[Part]->setStartValue(EntryExtracts[Part]);
+ Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+ }
+
+ return true;
+}
+
/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
- match(Term, m_BranchOnCond(
- m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+ match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+ m_VPValue(), m_VPValue(), m_VPValue()))))) {
// Try to simplify the branch condition if TC <= VF * UF when the latch
// terminator is BranchOnCount or BranchOnCond where the input is
// Not(ActiveLaneMask).
@@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
- bool MadeChange =
- simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+ bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
+ MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
if (MadeChange) {
@@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
+namespace {
+struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
+ static bool isSentinel(const VPSingleDefRecipe *Def) {
+ return Def == getEmptyKey() || Def == getTombstoneKey();
+ }
+
+ /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
+ /// Returns an optional pair, where the first element indicates whether it is
+ /// an intrinsic ID.
+ static std::optional<std::pair<bool, unsigned>>
+ getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
+ return TypeSwitch<const VPSingleDefRecipe *,
+ std::optional<std::pair<bool, unsigned>>>(R)
+ .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPWidenSelectRecipe, VPReplicateRecipe>(
+ [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+ .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+ return std::make_pair(true, I->getVectorIntrinsicID());
+ })
+ .Default([](auto *) { return std::nullopt; });
+ }
+
+ /// Returns true if recipe \p Def can be safely handled for CSE.
+ static bool canHandle(const VPSingleDefRecipe *Def) {
+ // We can extend the list of handled recipes in the future,
+ // provided we account for the data embedded in them while checking for
+ // equality or hashing.
+ auto C = getOpcodeOrIntrinsicID(Def);
+
+ // The issue with (Insert|Extract)Value is that the index of the
+ // insert/extract is not a proper operand in LLVM IR, and hence also not in
+ // VPlan.
+ if (!C || (!C->first && (C->second == Instruction::InsertValue ||
+ C->second == Instruction::ExtractValue)))
+ return false;
+
+ // During CSE, we can only handle recipes that don't read from memory: if
+ // they read from memory, there could be an intervening write to memory
+ // before the next instance is CSE'd, leading to an incorrect result.
+ return !Def->mayReadFromMemory();
+ }
+
+ /// Hash the underlying data of \p Def.
+ static unsigned getHashValue(const VPSingleDefRecipe *Def) {
+ const VPlan *Plan = Def->getParent()->getPlan();
+ VPTypeAnalysis TypeInfo(*Plan);
+ hash_code Result = hash_combine(
+ Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
+ TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
+ hash_combine_range(Def->operands()));
+ if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
+ if (RFlags->hasPredicate())
+ return hash_combine(Result, RFlags->getPredicate());
+ return Result;
+ }
+
+ /// Check equality of underlying data of \p L and \p R.
+ static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
+ if (isSentinel(L) || isSentinel(R))
+ return L == R;
+ if (L->getVPDefID() != R->getVPDefID() ||
+ getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
+ vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
+ !equal(L->operands(), R->operands()))
+ return false;
+ if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
+ if (LFlags->hasPredicate() &&
+ LFlags->getPredicate() !=
+ cast<VPRecipeWithIRFlags>(R)->getPredicate())
+ return false;
+ const VPlan *Plan = L->getParent()->getPlan();
+ VPTypeAnalysis TypeInfo(*Plan);
+ return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
+ }
+};
+} // end anonymous namespace
+
+/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
+/// Plan.
+void VPlanTransforms::cse(VPlan &Plan) {
+ VPDominatorTree VPDT(Plan);
+ DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+ if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
+ continue;
+ if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
+ // V must dominate Def for a valid replacement.
+ if (!VPDT.dominates(V->getParent(), VPBB))
+ continue;
+ // Only keep flags present on both V and Def.
+ if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
+ RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
+ Def->replaceAllUsesWith(V);
+ continue;
+ }
+ CSEMap[Def] = Def;
+ }
+ }
+}
+
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
runPass(removeRedundantInductionCasts, Plan);
runPass(simplifyRecipes, Plan);
- runPass(simplifyBlends, Plan);
runPass(removeDeadRecipes, Plan);
- runPass(narrowToSingleScalarRecipes, Plan);
+ runPass(simplifyBlends, Plan);
runPass(legalizeAndOptimizeInductions, Plan);
+ runPass(narrowToSingleScalarRecipes, Plan);
runPass(removeRedundantExpandSCEVRecipes, Plan);
runPass(simplifyRecipes, Plan);
runPass(removeBranchOnConst, Plan);
@@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- auto *EntryALM =
- Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
- DL, "active.lane.mask.entry");
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {EntryIncrement, TC, ALMMultiplier}, DL,
+ "active.lane.mask.entry");
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ auto *LaneMaskPhi =
+ new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
LaneMaskPhi->insertAfter(CanonicalIVPHI);
// Create the active lane mask for the next iteration of the loop before the
@@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
{IncrementValue}, {false, false}, DL);
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
- {InLoopIncrement, TripCount}, DL,
- "active.lane.mask.next");
+ {InLoopIncrement, TripCount, ALMMultiplier},
+ DL, "active.lane.mask.next");
LaneMaskPhi->addOperand(ALM);
// Replace the original terminator with BranchOnCond. We have to invert the
@@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
/// for the header-mask pattern manually.
static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
SmallVector<VPValue *> WideCanonicalIVs;
- auto *FoundWidenCanonicalIVUser =
- find_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+ IsaPred<VPWidenCanonicalIVRecipe>);
assert(count_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
- 1 &&
+ IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
"Must have at most one VPWideCanonicalIVRecipe");
if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
auto *WideCanonicalIV =
@@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask(
"DataAndControlFlowWithoutRuntimeCheck implies "
"UseActiveLaneMaskForControlFlow");
- auto *FoundWidenCanonicalIVUser =
- find_if(Plan.getCanonicalIV()->users(),
- [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+ IsaPred<VPWidenCanonicalIVRecipe>);
assert(FoundWidenCanonicalIVUser &&
"Must have widened canonical IV when tail folding!");
VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
@@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask(
Plan, DataAndControlFlowWithoutRuntimeCheck);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
- LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
- {WideCanonicalIV, Plan.getTripCount()}, nullptr,
- "active.lane.mask");
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+ ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ LaneMask =
+ B.createNaryOp(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+ nullptr, "active.lane.mask");
}
// Walk users of WideCanonicalIV and replace the header mask of the form
@@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewAddr = GetNewAddr(S->getAddr());
return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
+ .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+ VPValue *NewMask = GetNewMask(IR->getMask());
+ return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
+ })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPBuilder Builder(LoopRegion->getPreheaderVPBB());
MaxEVL = Builder.createScalarZExtOrTrunc(
MaxEVL, Type::getInt32Ty(Plan.getContext()),
- TypeInfo.inferScalarType(MaxEVL), DebugLoc());
+ TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- VPValue *PrevEVL =
- Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
+ VPValue *PrevEVL = Builder.createScalarPhi(
+ {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
@@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
if (!EVLRecipe)
continue;
- [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+ unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
"New recipe must define the same number of values as the "
"original.");
- assert(NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
+ EVLRecipe)) {
+ for (unsigned I = 0; I < NumDefVal; ++I) {
+ VPValue *CurVPV = CurRecipe->getVPValue(I);
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
+ }
}
ToErase.push_back(CurRecipe);
}
@@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength(
VPValue *StartV = CanonicalIVPHI->getStartValue();
// Create the ExplicitVectorLengthPhi recipe in the main loop.
- auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+ auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
EVLPhi->insertAfter(CanonicalIVPHI);
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Create the AVL (application vector length), starting from TC -> 0 in steps
@@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength(
VPValue *AVLSafe =
Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
- AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
+ AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
+ "safe_avl");
}
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
- DebugLoc());
+ DebugLoc::getUnknown());
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
@@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+ VPValue *AVL;
+ [[maybe_unused]] bool FoundAVL =
+ match(EVLIncrement,
+ m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
+ assert(FoundAVL && "Didn't find AVL?");
+
+ // The AVL may be capped to a safe distance.
+ VPValue *SafeAVL;
+ if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
+ AVL = SafeAVL;
+
+ VPValue *AVLNext;
+ [[maybe_unused]] bool FoundAVLNext =
+ match(AVL, m_VPInstruction<Instruction::PHI>(
+ m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
+ assert(FoundAVLNext && "Didn't find AVL backedge?");
// Convert EVLPhi to concrete recipe.
auto *ScalarR =
@@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Replace the use of VectorTripCount in the latch-exiting block.
// Before: (branch-on-count EVLIVInc, VectorTripCount)
- // After: (branch-on-count EVLIVInc, TripCount)
+ // After: (branch-on-cond eq AVLNext, 0)
VPBasicBlock *LatchExiting =
HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
@@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
m_BranchOnCount(m_VPValue(EVLIncrement),
m_Specific(&Plan.getVectorTripCount()))) &&
"Unexpected terminator in EVL loop");
- LatchExitingBr->setOperand(1, Plan.getTripCount());
+
+ Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
+ VPBuilder Builder(LatchExitingBr);
+ VPValue *Cmp =
+ Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
+ LatchExitingBr->eraseFromParent();
+}
+
+void VPlanTransforms::replaceSymbolicStrides(
+ VPlan &Plan, PredicatedScalarEvolution &PSE,
+ const DenseMap<Value *, const SCEV *> &StridesMap) {
+ // Replace VPValues for known constant strides guaranteed by predicate scalar
+ // evolution.
+ auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
+ auto *R = cast<VPRecipeBase>(&U);
+ return R->getParent()->getParent() ||
+ R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
+ };
+ for (const SCEV *Stride : StridesMap.values()) {
+ using namespace SCEVPatternMatch;
+ auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
+ const APInt *StrideConst;
+ if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
+ // Only handle constant strides for now.
+ continue;
+
+ auto *CI =
+ Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
+ StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+
+ // The versioned value may not be used in the loop directly but through a
+ // sext/zext. Add new live-ins in those cases.
+ for (Value *U : StrideV->users()) {
+ if (!isa<SExtInst, ZExtInst>(U))
+ continue;
+ VPValue *StrideVPV = Plan.getLiveIn(U);
+ if (!StrideVPV)
+ continue;
+ unsigned BW = U->getType()->getScalarSizeInBits();
+ APInt C =
+ isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
+ VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+ }
+ }
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
- Init =
- Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
+ Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
+ DebugLoc::getUnknown(), "induction");
// Create the widened phi of the vector IV.
auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
@@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
R->eraseFromParent();
}
-void VPlanTransforms::handleUncountableEarlyExit(
- VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
- VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
+void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
+ VPBasicBlock *EarlyExitVPBB,
+ VPlan &Plan,
+ VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *LatchVPBB) {
VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
if (!EarlyExitVPBB->getSinglePredecessor() &&
EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
- auto IsVector = [](ElementCount VF) { return VF.isVector(); };
- // When the VFs are vectors, need to add `extract` to get the incoming value
- // from early exit. When the range contains scalar VF, limit the range to
- // scalar VF to prevent mis-compilation for the range containing both scalar
- // and vector VFs.
- if (!IncomingFromEarlyExit->isLiveIn() &&
- LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
+ if (!IncomingFromEarlyExit->isLiveIn()) {
// Update the incoming value from the early exit.
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
@@ -3125,7 +3469,7 @@ static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
- if (Opcode != Instruction::Add)
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return nullptr;
Type *RedTy = Ctx.Types.inferScalarType(Red);
@@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Type *SrcTy =
Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
- InstructionCost MulAccCost =
- Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+ InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+ isZExt, Opcode, RedTy, SrcVecTy, CostKind);
InstructionCost MulCost = Mul->computeCost(VF, Ctx);
InstructionCost RedCost = Red->computeCost(VF, Ctx);
InstructionCost ExtCost = 0;
@@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
Plan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
+ assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
+ "VPExpandSCEVRecipes must be at the beginning of the entry block, "
+ "after any VPIRInstructions");
+ // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
+ // to the VPIRBasicBlock.
+ auto EI = Entry->begin();
+ for (Instruction &I : drop_end(*EntryBB)) {
+ if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
+ &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
+ EI++;
+ continue;
+ }
+ VPIRInstruction::create(I)->insertBefore(*Entry, EI);
+ }
+
return ExpandedSCEVs;
}
@@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) {
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
- if (VF.isScalable() || !VectorLoop)
+ if (!VectorLoop)
return;
VPTypeAnalysis TypeInfo(Plan);
- unsigned FixedVF = VF.getFixedValue();
+ unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
@@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
continue;
// Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
+ if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
VectorRegWidth))
return;
@@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
return;
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
- auto NarrowOp = [](VPValue *V) -> VPValue * {
+ SmallPtrSet<VPValue *, 4> NarrowedOps;
+ auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
auto *R = V->getDefiningRecipe();
- if (!R)
+ if (!R || NarrowedOps.contains(V))
return V;
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
// Narrow interleave group to wide load, as transformed VPlan will only
@@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
/*Reverse=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
+ NarrowedOps.insert(L);
return L;
}
@@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
assert(RepR->isSingleScalar() &&
isa<LoadInst>(RepR->getUnderlyingInstr()) &&
"must be a single scalar load");
+ NarrowedOps.insert(RepR);
return RepR;
}
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
/*IsUniform*/ true,
/*Mask*/ nullptr, *WideLoad);
N->insertBefore(WideLoad);
+ NarrowedOps.insert(N);
return N;
};
@@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
- Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
- CanIV->getScalarType(), 1 * Plan.getUF())));
- Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ VPBuilder PHBuilder(Plan.getVectorPreheader());
+
+ VPValue *UF = Plan.getOrAddLiveIn(
+ ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ if (VF.isScalable()) {
+ VPValue *VScale = PHBuilder.createElementCount(
+ CanIV->getScalarType(), ElementCount::getScalable(1));
+ VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
+ Inc->setOperand(1, VScaleUF);
+ Plan.getVF().replaceAllUsesWith(VScale);
+ } else {
+ Inc->setOperand(1, UF);
+ Plan.getVF().replaceAllUsesWith(
+ Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ }
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 700b94621d5f..1957428fab79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -62,16 +62,47 @@ struct VPlanTransforms {
/// The created loop is wrapped in an initial skeleton to facilitate
/// vectorization, consisting of a vector pre-header, an exit block for the
/// main vector loop (middle.block) and a new block as preheader of the scalar
- /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p
- /// InductionTy and \p IVDL, and creates a VPValue expression for the original
- /// trip count.
+ /// loop (scalar.ph). See below for an illustration. It also adds a canonical
+ /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
+ /// VPValue expression for the original trip count.
+ ///
+ /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
+ /// / \ old preheader. Will contain iteration number check and SCEV
+ /// | | expansions.
+ /// | |
+ /// / v
+ /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be
+ /// | / | added later.
+ /// | / v
+ /// || [ ] <-- vector pre header.
+ /// |/ |
+ /// | v
+ /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized.
+ /// | [ ]_|
+ /// | |
+ /// | v
+ /// | [ ] <--- middle-block with the branch to successors
+ /// | / |
+ /// | / |
+ /// | | v
+ /// \--->[ ] <--- scalar preheader (initially a VPBasicBlock, which will be
+ /// | | replaced later by a VPIRBasicBlock wrapping the scalar
+ /// | | preheader basic block).
+ /// | |
+ /// v <-- edge from middle to exit iff epilogue is not required.
+ /// | [ ] \
+ /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
+ /// | | header wrapped in VPIRBasicBlock).
+ /// \ |
+ /// \ v
+ /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
PredicatedScalarEvolution &PSE);
/// Update \p Plan to account for all early exits.
- LLVM_ABI_FOR_TEST static void
- handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
+ LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
+ bool HasUncountableExit);
/// If a check is needed to guard executing the scalar epilogue loop, it will
/// be added to the middle block.
@@ -79,6 +110,13 @@ struct VPlanTransforms {
bool RequiresScalarEpilogueCheck,
bool TailFolded);
+ /// Add a check to \p Plan to see if the vector loop should be executed.
+ static void addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);
+
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
/// flat CFG into a hierarchical CFG.
LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);
@@ -161,6 +199,12 @@ struct VPlanTransforms {
truncateToMinimalBitwidths(VPlan &Plan,
const MapVector<Instruction *, uint64_t> &MinBWs);
+ /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
+ /// possible.
+ static void
+ replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
+ const DenseMap<Value *, const SCEV *> &StridesMap);
+
/// Drop poison flags from recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
/// recipes meet the following conditions:
@@ -207,8 +251,7 @@ struct VPlanTransforms {
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPBasicBlock *EarlyExitVPBB,
VPlan &Plan, VPBasicBlock *HeaderVPBB,
- VPBasicBlock *LatchVPBB,
- VFRange &Range);
+ VPBasicBlock *LatchVPBB);
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
@@ -220,9 +263,10 @@ struct VPlanTransforms {
/// variable vector lengths instead of fixed lengths. This transformation:
/// * Makes EVL-Phi concrete.
// * Removes CanonicalIV and increment.
- /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
- /// VectorTripCount) with variable-length stepping (branch-on-cond
- /// EVLIVInc, TripCount).
+ /// * Replaces the exit condition from
+ /// (branch-on-count CanonicalIVInc, VectorTripCount)
+ /// to
+ /// (branch-on-cond eq AVLNext, 0)
static void canonicalizeEVLLoops(VPlan &Plan);
/// Lower abstract recipes to concrete ones, that can be codegen'd.
@@ -242,6 +286,9 @@ struct VPlanTransforms {
/// removing dead edges to their successors.
static void removeBranchOnConst(VPlan &Plan);
+ /// Perform common-subexpression-elimination on \p Plan.
+ static void cse(VPlan &Plan);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 4bcde8cd5d42..443df167378b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -92,18 +92,18 @@ public:
void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
unsigned Part) {
for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
- auto Ins = VPV2Parts.insert({VPV, {}});
- assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
- Ins.first->second.push_back(CopyR->getVPValue(Idx));
+ const auto &[V, _] = VPV2Parts.try_emplace(VPV);
+ assert(V->second.size() == Part - 1 && "earlier parts not set");
+ V->second.push_back(CopyR->getVPValue(Idx));
}
}
/// Given a uniform recipe \p R, add it for all parts.
void addUniformForAllParts(VPSingleDefRecipe *R) {
- auto Ins = VPV2Parts.insert({R, {}});
- assert(Ins.second && "uniform value already added");
+ const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
+ assert(Inserted && "uniform value already added");
for (unsigned Part = 0; Part != UF; ++Part)
- Ins.first->second.push_back(R);
+ V->second.push_back(R);
}
bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
@@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
VPBuilder Builder(RepR);
if (RepR->getNumUsers() == 0) {
- if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
- vputils::isSingleScalar(RepR->getOperand(1))) {
- // Stores to invariant addresses need to store the last lane only.
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
- Def2LaneDefs);
- } else {
- // Create single-scalar version of RepR for all lanes.
- for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
- cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
- }
+ // Create single-scalar version of RepR for all lanes.
+ for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
RepR->eraseFromParent();
continue;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 700a733bf9f2..c6c1ef336982 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
VPValue *A, *B;
using namespace VPlanPatternMatch;
- if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+ if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
return B == Plan.getTripCount() &&
(match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
m_SpecificInt(1),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d..77c099b27171 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) {
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
+ case VPInstruction::Not:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef..85c6c2c8d796 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,7 +38,7 @@ struct VPDoubleValueDef;
class VPSlotTracker;
class VPUser;
class VPRecipeBase;
-class VPInterleaveRecipe;
+class VPInterleaveBase;
class VPPhiAccessors;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -48,7 +48,7 @@ class VPPhiAccessors;
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
- friend class VPInterleaveRecipe;
+ friend class VPInterleaveBase;
friend class VPlan;
friend class VPExpressionRecipe;
@@ -335,6 +335,7 @@ public:
VPExpressionSC,
VPIRInstructionSC,
VPInstructionSC,
+ VPInterleaveEVLSC,
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418..99f3bc367a54 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+ VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
@@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) {
const VPBlockBase *Exiting = Region->getExiting();
// Entry and Exiting shouldn't have any predecessor/successor, respectively.
- if (Entry->getNumPredecessors() != 0) {
+ if (Entry->hasPredecessors()) {
errs() << "region entry block has predecessors\n";
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 092a3a87954f..17cb18a22336 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -99,6 +99,10 @@ private:
InstructionWorklist Worklist;
+ /// Next instruction to iterate. It will be updated when it is erased by
+ /// RecursivelyDeleteTriviallyDeadInstructions.
+ Instruction *NextInst;
+
// TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
// parameter. That should be updated to specific sub-classes because the
// run loop was changed to dispatch on opcode.
@@ -118,6 +122,7 @@ private:
bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
bool foldBitOpOfCastops(Instruction &I);
+ bool foldBitOpOfCastConstant(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeOpOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
@@ -169,13 +174,16 @@ private:
// further folds that were hindered by OneUse limits.
SmallPtrSet<Value *, 4> Visited;
for (Value *Op : Ops) {
- if (Visited.insert(Op).second) {
+ if (!Visited.contains(Op)) {
if (auto *OpI = dyn_cast<Instruction>(Op)) {
if (RecursivelyDeleteTriviallyDeadInstructions(
- OpI, nullptr, nullptr, [this](Value *V) {
- if (auto I = dyn_cast<Instruction>(V)) {
+ OpI, nullptr, nullptr, [&](Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
Worklist.remove(I);
+ if (I == NextInst)
+ NextInst = NextInst->getNextNode();
+ Visited.insert(I);
}
}))
continue;
@@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
if (LHSSrc->getType() != RHSSrc->getType())
return false;
- // Only handle vector types with integer elements
- auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
- auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!SrcVecTy || !DstVecTy)
+ auto *SrcTy = LHSSrc->getType();
+ auto *DstTy = I.getType();
+ // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
+ // Other casts only handle vector types with integer elements.
+ if (CastOpcode != Instruction::BitCast &&
+ (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
return false;
- if (!SrcVecTy->getScalarType()->isIntegerTy() ||
- !DstVecTy->getScalarType()->isIntegerTy())
+ // Only integer scalar/vector values are legal for bitwise logic operations.
+ if (!SrcTy->getScalarType()->isIntegerTy() ||
+ !DstTy->getScalarType()->isIntegerTy())
return false;
// Cost Check :
@@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
// NewCost = bitlogic + cast
// Calculate specific costs for each cast with instruction context
- InstructionCost LHSCastCost =
- TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None, CostKind, LHSCast);
- InstructionCost RHSCastCost =
- TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
- TTI::CastContextHint::None, CostKind, RHSCast);
+ InstructionCost LHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+ InstructionCost RHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
LHSCastCost + RHSCastCost;
// For new cost, we can't provide an instruction (it doesn't exist yet)
InstructionCost GenericCastCost = TTI.getCastInstrCost(
- CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+ TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
GenericCastCost;
// Account for multi-use casts using specific costs
@@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
return true;
}
+/// Match:
+///   bitop(castop(x), C) ->
+///   bitop(castop(x), castop(InvC)) ->
+///   castop(bitop(x, InvC))
+/// Supports: bitcast
+bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
+ Instruction *LHS;
+ Constant *C;
+
+ // Check if this is a bitwise logic operation
+ if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C))))
+ return false;
+
+ // Get the cast instruction
+ auto *LHSCast = dyn_cast<CastInst>(LHS);
+ if (!LHSCast)
+ return false;
+
+ Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+
+ // Only handle supported cast operations
+ switch (CastOpcode) {
+ case Instruction::BitCast:
+ break;
+ default:
+ return false;
+ }
+
+ Value *LHSSrc = LHSCast->getOperand(0);
+
+ auto *SrcTy = LHSSrc->getType();
+ auto *DstTy = I.getType();
+ // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
+ // Other casts only handle vector types with integer elements.
+ if (CastOpcode != Instruction::BitCast &&
+ (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
+ return false;
+
+ // Only integer scalar/vector values are legal for bitwise logic operations.
+ if (!SrcTy->getScalarType()->isIntegerTy() ||
+ !DstTy->getScalarType()->isIntegerTy())
+ return false;
+
+ // Find the constant InvC, such that castop(InvC) equals to C.
+ PreservedCastFlags RHSFlags;
+ Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
+ if (!InvC)
+ return false;
+
+ // Cost Check :
+ // OldCost = bitlogic + cast
+ // NewCost = bitlogic + cast
+
+ // Calculate specific costs for each cast with instruction context
+ InstructionCost LHSCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
+
+ // For new cost, we can't provide an instruction (it doesn't exist yet)
+ InstructionCost GenericCastCost = TTI.getCastInstrCost(
+ CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
+
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
+ GenericCastCost;
+
+ // Account for multi-use casts using specific costs
+ if (!LHSCast->hasOneUse())
+ NewCost += LHSCastCost;
+
+ LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
+ << " NewCost=" << NewCost << "\n");
+
+ if (NewCost > OldCost)
+ return false;
+
+ // Create the operation on the source type
+ Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
+ LHSSrc, InvC, I.getName() + ".inner");
+ if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
+ NewBinOp->copyIRFlags(&I);
+
+ Worklist.pushValue(NewOp);
+
+ // Create the cast operation directly to ensure we get a new instruction
+ Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+ // Insert the new instruction
+ Value *Result = Builder.Insert(NewCast);
+
+ replaceValue(I, *Result);
+ return true;
+}
+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
TTI::CastContextHint::None, CostKind, RedOp);
CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
- CostAfterReduction =
- TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+ CostAfterReduction = TTI.getMulAccReductionCost(
+ IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
return;
}
CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
@@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
unsigned MaxVectorSize =
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+ if (MaxElementsInVector == 0)
+ return false;
// When there are multiple shufflevector operations on the same input,
// especially when the vector length is larger than the register size,
// identical shuffle patterns may occur across different groups of elements.
@@ -4467,6 +4574,8 @@ bool VectorCombine::run() {
case Instruction::Xor:
if (foldBitOpOfCastops(I))
return true;
+ if (foldBitOpOfCastConstant(I))
+ return true;
break;
case Instruction::PHI:
if (shrinkPhiOfShuffles(I))
@@ -4519,13 +4628,21 @@ bool VectorCombine::run() {
if (!DT.isReachableFromEntry(&BB))
continue;
// Use early increment range so that we can erase instructions in loop.
- for (Instruction &I : make_early_inc_range(BB)) {
- if (I.isDebugOrPseudoInst())
- continue;
- MadeChange |= FoldInst(I);
+ // make_early_inc_range is not applicable here, as the next iterator may
+ // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
+ // We manually maintain the next instruction and update it when it is about
+ // to be deleted.
+ Instruction *I = &BB.front();
+ while (I) {
+ NextInst = I->getNextNode();
+ if (!I->isDebugOrPseudoInst())
+ MadeChange |= FoldInst(*I);
+ I = NextInst;
}
}
+ NextInst = nullptr;
+
while (!Worklist.isEmpty()) {
Instruction *I = Worklist.removeOne();
if (!I)