diff options
Diffstat (limited to 'llvm/lib/Transforms')
73 files changed, 4207 insertions, 2025 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 40a7f8043034..40de36d81ddd 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -83,8 +83,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt)) if (match(V, m_OneUse(m_c_Or( m_Shl(m_Value(ShVal0), m_Value(ShAmt)), - m_LShr(m_Value(ShVal1), - m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) { + m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width), + m_Deferred(ShAmt))))))) { return Intrinsic::fshl; } @@ -617,7 +617,7 @@ struct LoadOps { LoadInst *RootInsert = nullptr; bool FoundRoot = false; uint64_t LoadSize = 0; - const APInt *Shift = nullptr; + uint64_t Shift = 0; Type *ZextType; AAMDNodes AATags; }; @@ -627,17 +627,15 @@ struct LoadOps { // (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3) static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, AliasAnalysis &AA) { - const APInt *ShAmt2 = nullptr; + uint64_t ShAmt2; Value *X; Instruction *L1, *L2; // Go to the last node with loads. - if (match(V, m_OneUse(m_c_Or( - m_Value(X), - m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))), - m_APInt(ShAmt2)))))) || - match(V, m_OneUse(m_Or(m_Value(X), - m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) { + if (match(V, + m_OneUse(m_c_Or(m_Value(X), m_OneUse(m_ShlOrSelf( + m_OneUse(m_ZExt(m_Instruction(L2))), + ShAmt2)))))) { if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot) // Avoid Partial chain merge. 
return false; @@ -646,11 +644,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, // Check if the pattern has loads LoadInst *LI1 = LOps.Root; - const APInt *ShAmt1 = LOps.Shift; + uint64_t ShAmt1 = LOps.Shift; if (LOps.FoundRoot == false && - (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) || - match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))), - m_APInt(ShAmt1)))))) { + match(X, m_OneUse( + m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) { LI1 = dyn_cast<LoadInst>(L1); } LoadInst *LI2 = dyn_cast<LoadInst>(L2); @@ -726,13 +723,6 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, if (IsBigEndian) std::swap(ShAmt1, ShAmt2); - // Find Shifts values. - uint64_t Shift1 = 0, Shift2 = 0; - if (ShAmt1) - Shift1 = ShAmt1->getZExtValue(); - if (ShAmt2) - Shift2 = ShAmt2->getZExtValue(); - // First load is always LI1. This is where we put the new load. // Use the merged load size available from LI1 for forward loads. if (LOps.FoundRoot) { @@ -747,7 +737,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1; uint64_t PrevSize = DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1)); - if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize) + if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize) return false; // Update LOps @@ -824,7 +814,7 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, // Check if shift needed. We need to shift with the amount of load1 // shift if not zero. 
if (LOps.Shift) - NewOp = Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift)); + NewOp = Builder.CreateShl(NewOp, LOps.Shift); I.replaceAllUsesWith(NewOp); return true; @@ -860,11 +850,9 @@ static std::optional<PartStore> matchPartStore(Instruction &I, return std::nullopt; uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits(); - uint64_t ValOffset = 0; + uint64_t ValOffset; Value *Val; - if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val), - m_ConstantInt(ValOffset))), - m_Trunc(m_Value(Val))))) + if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset)))) return std::nullopt; Value *Ptr = Store->getPointerOperand(); diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index b775c4346019..08f03aa45255 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -700,9 +700,6 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false); - assert(Shape.getPromiseAlloca() && - "Coroutine with switch ABI should own Promise alloca"); - DIFile *DFile = DIS->getFile(); unsigned LineNum = DIS->getLine(); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 180ac9c61e7d..02c38d02cff6 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1568,14 +1568,22 @@ private: if (DebugLoc SuspendLoc = S->getDebugLoc()) { std::string LabelName = ("__coro_resume_" + Twine(SuspendIndex)).str(); - DILocation &DILoc = *SuspendLoc; + // Take the "inlined at" location recursively, if present. This is + // mandatory as the DILabel insertion checks that the scopes of label + // and the attached location match. This is not the case when the + // suspend location has been inlined due to pointing to the original + // scope. 
+ DILocation *DILoc = SuspendLoc; + while (DILocation *InlinedAt = DILoc->getInlinedAt()) + DILoc = InlinedAt; + DILabel *ResumeLabel = - DBuilder.createLabel(DIS, LabelName, DILoc.getFile(), + DBuilder.createLabel(DIS, LabelName, DILoc->getFile(), SuspendLoc.getLine(), SuspendLoc.getCol(), /*IsArtificial=*/true, /*CoroSuspendIdx=*/SuspendIndex, /*AlwaysPreserve=*/false); - DBuilder.insertLabel(ResumeLabel, &DILoc, ResumeBB->begin()); + DBuilder.insertLabel(ResumeLabel, DILoc, ResumeBB->begin()); } } diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 7bcb20de46ff..83aa7de5400f 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -1550,6 +1551,7 @@ void llvm::computeDeadSymbolsWithConstProp( const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { + llvm::TimeTraceScope timeScope("Drop dead symbols and propagate attributes"); computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols, isPrevailing); if (ImportEnabled) @@ -1664,6 +1666,7 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { void llvm::thinLTOFinalizeInModule(Module &TheModule, const GVSummaryMapTy &DefinedGlobals, bool PropagateAttrs) { + llvm::TimeTraceScope timeScope("ThinLTO finalize in module"); DenseSet<Comdat *> NonPrevailingComdats; auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) { // See if the global summary analysis computed a new resolved linkage. @@ -1791,6 +1794,7 @@ void llvm::thinLTOFinalizeInModule(Module &TheModule, /// Run internalization on \p TheModule based on symmary analysis. 
void llvm::thinLTOInternalizeModule(Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { + llvm::TimeTraceScope timeScope("ThinLTO internalize module"); // Declare a callback for the internalize pass that will ask for every // candidate GlobalValue if it can be internalized or not. auto MustPreserveGV = [&](const GlobalValue &GV) -> bool { @@ -1885,6 +1889,7 @@ Expected<bool> FunctionImporter::importFunctions( // Do the actual import of functions now, one Module at a time for (const auto &ModName : ImportList.getSourceModules()) { + llvm::TimeTraceScope timeScope("Import", ModName); // Get the module for the import Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(ModName); if (!SrcModuleOrErr) @@ -1900,102 +1905,114 @@ Expected<bool> FunctionImporter::importFunctions( // Find the globals to import SetVector<GlobalValue *> GlobalsToImport; - for (Function &F : *SrcModule) { - if (!F.hasName()) - continue; - auto GUID = F.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing function" - << (ImportDefinition - ? " definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << F.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = F.materialize()) - return std::move(Err); - // MemProf should match function's definition and summary, - // 'thinlto_src_module' is needed. - if (EnableImportMetadata || EnableMemProfContextDisambiguation) { - // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for - // statistics and debugging. 
- F.setMetadata( - "thinlto_src_module", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getModuleIdentifier())})); - F.setMetadata( - "thinlto_src_file", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getSourceFileName())})); + { + llvm::TimeTraceScope functionsScope("Functions"); + for (Function &F : *SrcModule) { + if (!F.hasName()) + continue; + auto GUID = F.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing function" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << F.getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (ImportDefinition) { + if (Error Err = F.materialize()) + return std::move(Err); + // MemProf should match function's definition and summary, + // 'thinlto_src_module' is needed. + if (EnableImportMetadata || EnableMemProfContextDisambiguation) { + // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for + // statistics and debugging. + F.setMetadata( + "thinlto_src_module", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getModuleIdentifier())})); + F.setMetadata( + "thinlto_src_file", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); + } + GlobalsToImport.insert(&F); } - GlobalsToImport.insert(&F); } } - for (GlobalVariable &GV : SrcModule->globals()) { - if (!GV.hasName()) - continue; - auto GUID = GV.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing global" - << (ImportDefinition - ? 
" definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << GV.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = GV.materialize()) - return std::move(Err); - ImportedGVCount += GlobalsToImport.insert(&GV); + { + llvm::TimeTraceScope globalsScope("Globals"); + for (GlobalVariable &GV : SrcModule->globals()) { + if (!GV.hasName()) + continue; + auto GUID = GV.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing global" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << GV.getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (ImportDefinition) { + if (Error Err = GV.materialize()) + return std::move(Err); + ImportedGVCount += GlobalsToImport.insert(&GV); + } } } - for (GlobalAlias &GA : SrcModule->aliases()) { - if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject())) - continue; - auto GUID = GA.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing alias" - << (ImportDefinition - ? " definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << GA.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = GA.materialize()) - return std::move(Err); - // Import alias as a copy of its aliasee. 
- GlobalObject *GO = GA.getAliaseeObject(); - if (Error Err = GO->materialize()) - return std::move(Err); - auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); - LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " " - << GO->getName() << " from " + { + llvm::TimeTraceScope aliasesScope("Aliases"); + for (GlobalAlias &GA : SrcModule->aliases()) { + if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject())) + continue; + auto GUID = GA.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing alias" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << GA.getName() << " from " << SrcModule->getSourceFileName() << "\n"); - if (EnableImportMetadata || EnableMemProfContextDisambiguation) { - // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for - // statistics and debugging. - Fn->setMetadata( - "thinlto_src_module", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getModuleIdentifier())})); - Fn->setMetadata( - "thinlto_src_file", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getSourceFileName())})); + if (ImportDefinition) { + if (Error Err = GA.materialize()) + return std::move(Err); + // Import alias as a copy of its aliasee. + GlobalObject *GO = GA.getAliaseeObject(); + if (Error Err = GO->materialize()) + return std::move(Err); + auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); + LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() + << " " << GO->getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (EnableImportMetadata || EnableMemProfContextDisambiguation) { + // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for + // statistics and debugging. 
+ Fn->setMetadata( + "thinlto_src_module", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getModuleIdentifier())})); + Fn->setMetadata( + "thinlto_src_file", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); + } + GlobalsToImport.insert(Fn); } - GlobalsToImport.insert(Fn); } } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 9196a0147c43..30459caee160 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -89,6 +89,8 @@ static cl::opt<bool> SpecializeLiteralConstant( "Enable specialization of functions that take a literal constant as an " "argument")); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const { unsigned I = 0; @@ -784,9 +786,31 @@ bool FunctionSpecializer::run() { // Update the known call sites to call the clone. 
for (CallBase *Call : S.CallSites) { + Function *Clone = S.Clone; LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call - << " to call " << S.Clone->getName() << "\n"); + << " to call " << Clone->getName() << "\n"); Call->setCalledFunction(S.Clone); + auto &BFI = GetBFI(*Call->getFunction()); + std::optional<uint64_t> Count = + BFI.getBlockProfileCount(Call->getParent()); + if (Count && !ProfcheckDisableMetadataFixes) { + std::optional<llvm::Function::ProfileCount> MaybeCloneCount = + Clone->getEntryCount(); + assert(MaybeCloneCount && "Clone entry count was not set!"); + uint64_t CallCount = *Count + MaybeCloneCount->getCount(); + Clone->setEntryCount(CallCount); + if (std::optional<llvm::Function::ProfileCount> MaybeOriginalCount = + S.F->getEntryCount()) { + uint64_t OriginalCount = MaybeOriginalCount->getCount(); + if (OriginalCount >= CallCount) { + S.F->setEntryCount(OriginalCount - CallCount); + } else { + // This should generally not happen as that would mean there are + // more computed calls to the function than what was recorded. + LLVM_DEBUG(S.F->setEntryCount(0)); + } + } + } } Clones.push_back(S.Clone); @@ -838,14 +862,24 @@ bool FunctionSpecializer::run() { } void FunctionSpecializer::removeDeadFunctions() { - for (Function *F : FullySpecialized) { + for (Function *F : DeadFunctions) { LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function " << F->getName() << "\n"); if (FAM) FAM->clear(*F, F->getName()); + + // Remove all the callsites that were proven unreachable once, and replace + // them with poison. 
+ for (User *U : make_early_inc_range(F->users())) { + assert((isa<CallInst>(U) || isa<InvokeInst>(U)) && + "User of dead function must be call or invoke"); + Instruction *CS = cast<Instruction>(U); + CS->replaceAllUsesWith(PoisonValue::get(CS->getType())); + CS->eraseFromParent(); + } F->eraseFromParent(); } - FullySpecialized.clear(); + DeadFunctions.clear(); } /// Clone the function \p F and remove the ssa_copy intrinsics added by @@ -1033,6 +1067,9 @@ Function *FunctionSpecializer::createSpecialization(Function *F, // clone must. Clone->setLinkage(GlobalValue::InternalLinkage); + if (F->getEntryCount() && !ProfcheckDisableMetadataFixes) + Clone->setEntryCount(0); + // Initialize the lattice state of the arguments of the function clone, // marking the argument on which we specialized the function constant // with the given value. @@ -1206,8 +1243,11 @@ void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin, // If the function has been completely specialized, the original function // is no longer needed. Mark it unreachable. - if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) { + // NOTE: If the address of a function is taken, we cannot treat it as dead + // function. 
+ if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F) && + !F->hasAddressTaken()) { Solver.markFunctionUnreachable(F); - FullySpecialized.insert(F); + DeadFunctions.insert(F); } } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d7edd1288309..f88d51f443bc 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2551,7 +2551,8 @@ static bool OptimizeNonTrivialIFuncs( })) continue; - assert(!Callees.empty() && "Expecting successful collection of versions"); + if (Callees.empty()) + continue; LLVM_DEBUG(dbgs() << "Statically resolving calls to function " << Resolver->getName() << "\n"); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index c57981ae4ca0..fdf0c3ac8007 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -686,9 +686,6 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, /* Outlined code is optimized code by definition. */ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized); - // Don't add any new variables to the subprogram. - DB.finalizeSubprogram(OutlinedSP); - // Attach subprogram to the function. F->setSubprogram(OutlinedSP); // We're done with the DIBuilder. 
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 57844a10aa9c..821a9d82ddb0 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -504,10 +504,7 @@ class LowerTypeTestsModule { void importTypeTest(CallInst *CI); void importFunction(Function *F, bool isJumpTableCanonical); - BitSetInfo - buildBitSet(Metadata *TypeId, - const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); - ByteArrayInfo *createByteArray(BitSetInfo &BSI); + ByteArrayInfo *createByteArray(const BitSetInfo &BSI); void allocateByteArrays(); Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL, Value *BitOffset); @@ -578,9 +575,9 @@ public: /// Build a bit set for TypeId using the object layouts in /// GlobalLayout. -BitSetInfo LowerTypeTestsModule::buildBitSet( - Metadata *TypeId, - const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { +static BitSetInfo +buildBitSet(Metadata *TypeId, + const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { BitSetBuilder BSB; // Compute the byte offset of each address associated with this type @@ -615,7 +612,7 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits, return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0)); } -ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) { +ByteArrayInfo *LowerTypeTestsModule::createByteArray(const BitSetInfo &BSI) { // Create globals to stand in for byte arrays and masks. These never actually // get initialized, we RAUW and erase them later in allocateByteArrays() once // we know the offset and mask to use. 
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b8c99f1f3389..7f9693169af0 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -3965,6 +3965,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( void ModuleCallsiteContextGraph::updateAllocationCall( CallInfo &Call, AllocationType AllocType) { std::string AllocTypeString = getAllocTypeAttributeString(AllocType); + removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call())); auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), "memprof", AllocTypeString); cast<CallBase>(Call.call())->addFnAttr(A); @@ -5501,6 +5502,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // clone J-1 (J==0 is the original clone and does not have a VMaps // entry). CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); + removeAnyExistingAmbiguousAttribute(CBClone); CBClone->addFnAttr(A); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) << ore::NV("AllocationCall", CBClone) << " in clone " diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index d50de34dfa48..2ecadd529170 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -169,6 +169,13 @@ static bool runIPSCCP( for (Function &F : M) { if (F.isDeclaration()) continue; + // Skip the dead functions marked by FunctionSpecializer, avoiding removing + // blocks in dead functions. Set MadeChanges if there is any dead function + // that will be removed later. 
+ if (IsFuncSpecEnabled && Specializer.isDeadFunction(&F)) { + MadeChanges = true; + continue; + } SmallVector<BasicBlock *, 512> BlocksToErase; @@ -326,12 +333,15 @@ static bool runIPSCCP( LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); for (User *U : make_early_inc_range(GV->users())) { - // We can remove LoadInst here, because we already replaced its users - // with a constant. + // We can remove LoadInst here. The LoadInsts in dead functions marked by + // FuncSpec are not simplified to constants, thus poison them. assert((isa<StoreInst>(U) || isa<LoadInst>(U)) && "Only Store|Load Instruction can be user of GlobalVariable at " "reaching here."); - cast<Instruction>(U)->eraseFromParent(); + Instruction *I = cast<Instruction>(U); + if (isa<LoadInst>(I)) + I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->eraseFromParent(); } // Try to create a debug constant expression for the global variable diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 838f97c8f49a..2340fe556538 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -269,6 +269,12 @@ static bool enableUnifiedLTO(Module &M) { } #endif +bool mustEmitToMergedModule(const GlobalValue *GV) { + // The __cfi_check definition is filled in by the CrossDSOCFI pass which + // runs only in the merged module. + return GV->getName() == "__cfi_check"; +} + // If it's possible to split M into regular and thin LTO parts, do so and write // a multi-module bitcode file with the two parts to OS. Otherwise, write only a // regular LTO bitcode file to OS. @@ -350,19 +356,13 @@ void splitAndWriteThinLTOBitcode( }); } - auto MustEmitToMergedModule = [](const GlobalValue *GV) { - // The __cfi_check definition is filled in by the CrossDSOCFI pass which - // runs only in the merged module. 
- return GV->getName() == "__cfi_check"; - }; - ValueToValueMapTy VMap; std::unique_ptr<Module> MergedM( CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool { if (const auto *C = GV->getComdat()) if (MergedMComdats.count(C)) return true; - if (MustEmitToMergedModule(GV)) + if (mustEmitToMergedModule(GV)) return true; if (auto *F = dyn_cast<Function>(GV)) return EligibleVirtualFns.count(F); @@ -380,7 +380,7 @@ void splitAndWriteThinLTOBitcode( cloneUsedGlobalVariables(M, *MergedM, /*CompilerUsed*/ true); for (Function &F : *MergedM) - if (!F.isDeclaration() && !MustEmitToMergedModule(&F)) { + if (!F.isDeclaration() && !mustEmitToMergedModule(&F)) { // Reset the linkage of all functions eligible for virtual constant // propagation. The canonical definitions live in the thin LTO module so // that they can be imported. @@ -406,7 +406,7 @@ void splitAndWriteThinLTOBitcode( if (const auto *C = GV->getComdat()) if (MergedMComdats.count(C)) return false; - if (MustEmitToMergedModule(GV)) + if (mustEmitToMergedModule(GV)) return false; return true; }); @@ -529,11 +529,13 @@ bool enableSplitLTOUnit(Module &M) { return EnableSplitLTOUnit; } -// Returns whether this module needs to be split because it uses type metadata. -bool hasTypeMetadata(Module &M) { +// Returns whether this module needs to be split (if splitting is enabled). +bool requiresSplit(Module &M) { for (auto &GO : M.global_objects()) { if (GO.hasMetadata(LLVMContext::MD_type)) return true; + if (mustEmitToMergedModule(&GO)) + return true; } return false; } @@ -543,9 +545,9 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, Module &M, const ModuleSummaryIndex *Index, const bool ShouldPreserveUseListOrder) { std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr; - // See if this module has any type metadata. If so, we try to split it + // See if this module needs to be split. If so, we try to split it // or at least promote type ids to enable WPD. 
- if (hasTypeMetadata(M)) { + if (requiresSplit(M)) { if (enableSplitLTOUnit(M)) { splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M, ShouldPreserveUseListOrder); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index aec484f8a18f..bfb25c806e53 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -60,6 +60,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -68,6 +69,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" @@ -82,12 +84,15 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/GlobPattern.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" @@ -95,6 +100,7 @@ #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include <algorithm> +#include <cmath> #include <cstddef> #include <map> #include <set> @@ -167,6 +173,8 @@ static cl::list<std::string> cl::desc("Prevent function(s) from being devirtualized"), cl::Hidden, cl::CommaSeparated); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + /// With Clang, a pure virtual class's 
deleting destructor is emitted as a /// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the /// context of whole program devirtualization, the deleting destructor of a pure @@ -451,21 +459,21 @@ struct VirtualCallSite { void emitRemark(const StringRef OptName, const StringRef TargetName, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { + function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter) { Function *F = CB.getCaller(); DebugLoc DLoc = CB.getDebugLoc(); BasicBlock *Block = CB.getParent(); using namespace ore; - OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block) - << NV("Optimization", OptName) - << ": devirtualized a call to " - << NV("FunctionName", TargetName)); + OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block) + << NV("Optimization", OptName) + << ": devirtualized a call to " + << NV("FunctionName", TargetName)); } void replaceAndErase( const StringRef OptName, const StringRef TargetName, bool RemarksEnabled, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, + function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter, Value *New) { if (RemarksEnabled) emitRemark(OptName, TargetName, OREGetter); @@ -570,25 +578,24 @@ void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB, struct DevirtModule { Module &M; - function_ref<AAResults &(Function &)> AARGetter; - function_ref<DominatorTree &(Function &)> LookupDomTree; + ModuleAnalysisManager &MAM; + FunctionAnalysisManager &FAM; - ModuleSummaryIndex *ExportSummary; - const ModuleSummaryIndex *ImportSummary; + ModuleSummaryIndex *const ExportSummary; + const ModuleSummaryIndex *const ImportSummary; - IntegerType *Int8Ty; - PointerType *Int8PtrTy; - IntegerType *Int32Ty; - IntegerType *Int64Ty; - IntegerType *IntPtrTy; + IntegerType *const Int8Ty; + PointerType *const Int8PtrTy; + IntegerType *const Int32Ty; + IntegerType *const Int64Ty; + IntegerType *const IntPtrTy; /// Sizeless array type, used 
for imported vtables. This provides a signal /// to analyzers that these imports may alias, as they do for example /// when multiple unique return values occur in the same vtable. - ArrayType *Int8Arr0Ty; - - bool RemarksEnabled; - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; + ArrayType *const Int8Arr0Ty; + const bool RemarksEnabled; + std::function<OptimizationRemarkEmitter &(Function &)> OREGetter; MapVector<VTableSlot, VTableSlotInfo> CallSlots; // Calls that have already been optimized. We may add a call to multiple @@ -611,12 +618,11 @@ struct DevirtModule { std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; PatternList FunctionsToSkip; - DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree, + DevirtModule(Module &M, ModuleAnalysisManager &MAM, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) - : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree), + : M(M), MAM(MAM), + FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), ExportSummary(ExportSummary), ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())), Int8PtrTy(PointerType::getUnqual(M.getContext())), @@ -624,7 +630,10 @@ struct DevirtModule { Int64Ty(Type::getInt64Ty(M.getContext())), IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)), Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)), - RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) { + RemarksEnabled(areRemarksEnabled()), + OREGetter([&](Function &F) -> OptimizationRemarkEmitter & { + return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + }) { assert(!(ExportSummary && ImportSummary)); FunctionsToSkip.init(SkipFunctionNames); } @@ -653,7 +662,7 @@ struct DevirtModule { VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res); - void applyICallBranchFunnel(VTableSlotInfo 
&SlotInfo, Constant *JT, + void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Function &JT, bool &IsExported); void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, @@ -738,10 +747,7 @@ struct DevirtModule { // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. - static bool - runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree); + static bool runForTesting(Module &M, ModuleAnalysisManager &MAM); }; struct DevirtIndex { @@ -782,25 +788,13 @@ struct DevirtIndex { } // end anonymous namespace PreservedAnalyses WholeProgramDevirtPass::run(Module &M, - ModuleAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto AARGetter = [&](Function &F) -> AAResults & { - return FAM.getResult<AAManager>(F); - }; - auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); - }; - auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & { - return FAM.getResult<DominatorTreeAnalysis>(F); - }; + ModuleAnalysisManager &MAM) { if (UseCommandLine) { - if (!DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree)) + if (!DevirtModule::runForTesting(M, MAM)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary, - ImportSummary) - .run()) + if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -832,8 +826,8 @@ typeIDVisibleToRegularObj(StringRef TypeID, // function for the base type and thus only contains a reference to the // type info (_ZTI). To catch this case we query using the type info // symbol corresponding to the TypeID. 
- std::string typeInfo = ("_ZTI" + TypeID).str(); - return IsVisibleToRegularObj(typeInfo); + std::string TypeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(TypeInfo); } static bool @@ -842,7 +836,7 @@ skipUpdateDueToValidation(GlobalVariable &GV, SmallVector<MDNode *, 2> Types; GV.getMetadata(LLVMContext::MD_type, Types); - for (auto Type : Types) + for (auto *Type : Types) if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get())) return typeIDVisibleToRegularObj(TypeID->getString(), IsVisibleToRegularObj); @@ -881,6 +875,7 @@ void llvm::updateVCallVisibilityInModule( void llvm::updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO) { + llvm::TimeTraceScope timeScope("Update public type test calls"); Function *PublicTypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (!PublicTypeTestFunc) @@ -912,9 +907,9 @@ void llvm::getVisibleToRegularObjVtableGUIDs( ModuleSummaryIndex &Index, DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols, function_ref<bool(StringRef)> IsVisibleToRegularObj) { - for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { - if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) - for (const TypeIdOffsetVtableInfo &P : typeID.second) + for (const auto &TypeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(TypeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : TypeID.second) VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); } } @@ -957,7 +952,7 @@ void llvm::runWholeProgramDevirtOnIndex( void llvm::updateIndexWPDForExports( ModuleSummaryIndex &Summary, - function_ref<bool(StringRef, ValueInfo)> isExported, + function_ref<bool(StringRef, ValueInfo)> IsExported, std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { for (auto &T : LocalWPDTargetsMap) { auto &VI = T.first; @@ -965,7 +960,7 @@ void llvm::updateIndexWPDForExports( assert(VI.getSummaryList().size() == 1 && 
"Devirt of local target has more than one copy"); auto &S = VI.getSummaryList()[0]; - if (!isExported(S->modulePath(), VI)) + if (!IsExported(S->modulePath(), VI)) continue; // It's been exported by a cross module import. @@ -995,10 +990,7 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) { return ErrorSuccess(); } -bool DevirtModule::runForTesting( - Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree) { +bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { std::unique_ptr<ModuleSummaryIndex> Summary = std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false); @@ -1023,7 +1015,7 @@ bool DevirtModule::runForTesting( } bool Changed = - DevirtModule(M, AARGetter, OREGetter, LookupDomTree, + DevirtModule(M, MAM, ClSummaryAction == PassSummaryAction::Export ? Summary.get() : nullptr, ClSummaryAction == PassSummaryAction::Import ? Summary.get() @@ -1071,7 +1063,7 @@ void DevirtModule::buildTypeIdentifierMap( } for (MDNode *Type : Types) { - auto TypeID = Type->getOperand(1).get(); + auto *TypeID = Type->getOperand(1).get(); uint64_t Offset = cast<ConstantInt>( @@ -1120,7 +1112,7 @@ bool DevirtModule::tryFindVirtualCallTargets( // Save the symbol used in the vtable to use as the devirtualization // target. 
- auto GV = dyn_cast<GlobalValue>(C); + auto *GV = dyn_cast<GlobalValue>(C); assert(GV); TargetsForSlot.push_back({GV, &TM}); } @@ -1284,7 +1276,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Apply(P.second); } -static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { +static bool addCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { // We can't add calls if we haven't seen a definition if (Callee.getSummaryList().empty()) return false; @@ -1359,7 +1351,7 @@ bool DevirtModule::trySingleImplDevirt( if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID())) // Any needed promotion of 'TheFn' has already been done during // LTO unit split, so we can ignore return value of AddCalls. - AddCalls(SlotInfo, TheFnVI); + addCalls(SlotInfo, TheFnVI); Res->TheKind = WholeProgramDevirtResolution::SingleImpl; Res->SingleImplName = std::string(TheFn->getName()); @@ -1400,7 +1392,7 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, DevirtTargets.insert(TheFn); auto &S = TheFn.getSummaryList()[0]; - bool IsExported = AddCalls(SlotInfo, TheFn); + bool IsExported = addCalls(SlotInfo, TheFn); if (IsExported) ExportedGUIDs.insert(TheFn.getGUID()); @@ -1497,13 +1489,19 @@ void DevirtModule::tryICallBranchFunnel( ReturnInst::Create(M.getContext(), nullptr, BB); bool IsExported = false; - applyICallBranchFunnel(SlotInfo, JT, IsExported); + applyICallBranchFunnel(SlotInfo, *JT, IsExported); if (IsExported) Res->TheKind = WholeProgramDevirtResolution::BranchFunnel; + + if (!JT->getEntryCount().has_value()) { + // FIXME: we could pass through thinlto the necessary information. 
+ setExplicitlyUnknownFunctionEntryCount(*JT); + } } void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, - Constant *JT, bool &IsExported) { + Function &JT, bool &IsExported) { + DenseMap<Function *, double> FunctionEntryCounts; auto Apply = [&](CallSiteInfo &CSInfo) { if (CSInfo.isExported()) IsExported = true; @@ -1531,8 +1529,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, NumBranchFunnel++; if (RemarksEnabled) - VCallSite.emitRemark("branch-funnel", - JT->stripPointerCasts()->getName(), OREGetter); + VCallSite.emitRemark("branch-funnel", JT.getName(), OREGetter); // Pass the address of the vtable in the nest register, which is r10 on // x86_64. @@ -1548,11 +1545,28 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, llvm::append_range(Args, CB.args()); CallBase *NewCS = nullptr; + if (!JT.isDeclaration() && !ProfcheckDisableMetadataFixes) { + // Accumulate the call frequencies of the original call site, and use + // that as total entry count for the funnel function. 
+ auto &F = *CB.getCaller(); + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + auto EC = BFI.getBlockFreq(&F.getEntryBlock()); + auto CC = F.getEntryCount(/*AllowSynthetic=*/true); + double CallCount = 0.0; + if (EC.getFrequency() != 0 && CC && CC->getCount() != 0) { + double CallFreq = + static_cast<double>( + BFI.getBlockFreq(CB.getParent()).getFrequency()) / + EC.getFrequency(); + CallCount = CallFreq * CC->getCount(); + } + FunctionEntryCounts[&JT] += CallCount; + } if (isa<CallInst>(CB)) - NewCS = IRB.CreateCall(NewFT, JT, Args); + NewCS = IRB.CreateCall(NewFT, &JT, Args); else NewCS = - IRB.CreateInvoke(NewFT, JT, cast<InvokeInst>(CB).getNormalDest(), + IRB.CreateInvoke(NewFT, &JT, cast<InvokeInst>(CB).getNormalDest(), cast<InvokeInst>(CB).getUnwindDest(), Args); NewCS->setCallingConv(CB.getCallingConv()); @@ -1586,6 +1600,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Apply(SlotInfo.CSInfo); for (auto &P : SlotInfo.ConstCSInfo) Apply(P.second); + for (auto &[F, C] : FunctionEntryCounts) { + assert(!F->getEntryCount(/*AllowSynthetic=*/true) && + "Unexpected entry count for funnel that was freshly synthesized"); + F->setEntryCount(static_cast<uint64_t>(std::round(C))); + } } bool DevirtModule::tryEvaluateFunctionsWithArgs( @@ -1597,7 +1616,7 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. - auto Fn = dyn_cast<Function>(Target.Fn); + auto *Fn = dyn_cast<Function>(Target.Fn); if (!Fn) return false; @@ -1836,11 +1855,11 @@ bool DevirtModule::tryVirtualConstProp( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. 
- auto Fn = dyn_cast<Function>(TargetsForSlot[0].Fn); + auto *Fn = dyn_cast<Function>(TargetsForSlot[0].Fn); if (!Fn) return false; // This only works if the function returns an integer. - auto RetType = dyn_cast<IntegerType>(Fn->getReturnType()); + auto *RetType = dyn_cast<IntegerType>(Fn->getReturnType()); if (!RetType) return false; unsigned BitWidth = RetType->getBitWidth(); @@ -1871,12 +1890,12 @@ bool DevirtModule::tryVirtualConstProp( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. - auto Fn = dyn_cast<Function>(Target.Fn); + auto *Fn = dyn_cast<Function>(Target.Fn); if (!Fn) return false; if (Fn->isDeclaration() || - !computeFunctionBodyMemoryAccess(*Fn, AARGetter(*Fn)) + !computeFunctionBodyMemoryAccess(*Fn, FAM.getResult<AAManager>(*Fn)) .doesNotAccessMemory() || Fn->arg_empty() || !Fn->arg_begin()->use_empty() || Fn->getReturnType() != RetType) @@ -1992,11 +2011,11 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { // Build an anonymous global containing the before bytes, followed by the // original initializer, followed by the after bytes. - auto NewInit = ConstantStruct::getAnon( + auto *NewInit = ConstantStruct::getAnon( {ConstantDataArray::get(M.getContext(), B.Before.Bytes), B.GV->getInitializer(), ConstantDataArray::get(M.getContext(), B.After.Bytes)}); - auto NewGV = + auto *NewGV = new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(), GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); @@ -2009,7 +2028,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { // Build an alias named after the original global, pointing at the second // element (the original initializer). 
- auto Alias = GlobalAlias::create( + auto *Alias = GlobalAlias::create( B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "", ConstantExpr::getInBoundsGetElementPtr( NewInit->getType(), NewGV, @@ -2050,7 +2069,7 @@ void DevirtModule::scanTypeTestUsers( // Search for virtual calls based on %p and add them to DevirtCalls. SmallVector<DevirtCallSite, 1> DevirtCalls; SmallVector<CallInst *, 1> Assumes; - auto &DT = LookupDomTree(*CI->getFunction()); + auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction()); findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT); Metadata *TypeId = @@ -2127,7 +2146,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { SmallVector<Instruction *, 1> LoadedPtrs; SmallVector<Instruction *, 1> Preds; bool HasNonCallUses = false; - auto &DT = LookupDomTree(*CI->getFunction()); + auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction()); findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds, HasNonCallUses, CI, DT); @@ -2259,18 +2278,18 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) { // The type of the function is irrelevant, because it's bitcast at calls // anyhow. - Constant *JT = cast<Constant>( + auto *JT = cast<Function>( M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"), Type::getVoidTy(M.getContext())) .getCallee()); bool IsExported = false; - applyICallBranchFunnel(SlotInfo, JT, IsExported); + applyICallBranchFunnel(SlotInfo, *JT, IsExported); assert(!IsExported); } } void DevirtModule::removeRedundantTypeTests() { - auto True = ConstantInt::getTrue(M.getContext()); + auto *True = ConstantInt::getTrue(M.getContext()); for (auto &&U : NumUnsafeUsesForTypeTest) { if (U.second == 0) { U.first->replaceAllUsesWith(True); @@ -2490,18 +2509,17 @@ bool DevirtModule::run() { // Generate remarks for each devirtualized function. 
for (const auto &DT : DevirtTargets) { GlobalValue *GV = DT.second; - auto F = dyn_cast<Function>(GV); + auto *F = dyn_cast<Function>(GV); if (!F) { - auto A = dyn_cast<GlobalAlias>(GV); + auto *A = dyn_cast<GlobalAlias>(GV); assert(A && isa<Function>(A->getAliasee())); F = dyn_cast<Function>(A->getAliasee()); assert(F); } using namespace ore; - OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) - << "devirtualized " - << NV("FunctionName", DT.first)); + OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) + << "devirtualized " << NV("FunctionName", DT.first)); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index d934638c15e7..f9155cc66031 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2115,6 +2115,7 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { } // Find common base and collect RHS GEPs. + bool First = true; while (true) { if (Ptrs.contains(RHS)) { Base.Ptr = RHS; @@ -2123,7 +2124,12 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { if (auto *GEP = dyn_cast<GEPOperator>(RHS)) { Base.RHSGEPs.push_back(GEP); - Base.RHSNW &= GEP->getNoWrapFlags(); + if (First) { + First = false; + Base.RHSNW = GEP->getNoWrapFlags(); + } else { + Base.RHSNW = Base.RHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags()); + } RHS = GEP->getPointerOperand(); } else { // No common base. @@ -2132,13 +2138,19 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { } // Collect LHS GEPs. 
+ First = true; while (true) { if (LHS == Base.Ptr) break; auto *GEP = cast<GEPOperator>(LHS); Base.LHSGEPs.push_back(GEP); - Base.LHSNW &= GEP->getNoWrapFlags(); + if (First) { + First = false; + Base.LHSNW = GEP->getNoWrapFlags(); + } else { + Base.LHSNW = Base.LHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags()); + } LHS = GEP->getPointerOperand(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a13d3ceb6132..2d7524e8018b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1799,16 +1799,21 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast, // type may provide more information to later folds, and the smaller logic // instruction may be cheaper (particularly in the case of vectors). Value *X; + auto &DL = IC.getDataLayout(); if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) { - if (Constant *TruncC = IC.getLosslessUnsignedTrunc(C, SrcTy)) { + PreservedCastFlags Flags; + if (Constant *TruncC = getLosslessUnsignedTrunc(C, SrcTy, DL, &Flags)) { // LogicOpc (zext X), C --> zext (LogicOpc X, C) Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC); - return new ZExtInst(NewOp, DestTy); + auto *ZExt = new ZExtInst(NewOp, DestTy); + ZExt->setNonNeg(Flags.NNeg); + ZExt->andIRFlags(Cast); + return ZExt; } } if (match(Cast, m_OneUse(m_SExtLike(m_Value(X))))) { - if (Constant *TruncC = IC.getLosslessSignedTrunc(C, SrcTy)) { + if (Constant *TruncC = getLosslessSignedTrunc(C, SrcTy, DL)) { // LogicOpc (sext X), C --> sext (LogicOpc X, C) Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC); return new SExtInst(NewOp, DestTy); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 42b65dde6725..33b66aeaffe6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1956,7 +1956,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *C; if (match(I0, m_ZExt(m_Value(X))) && match(I1, m_Constant(C)) && I0->hasOneUse()) { - if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType())) { + if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType(), DL)) { Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC); return CastInst::Create(Instruction::ZExt, NarrowMaxMin, II->getType()); } @@ -2006,7 +2006,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *C; if (match(I0, m_SExt(m_Value(X))) && match(I1, m_Constant(C)) && I0->hasOneUse()) { - if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType())) { + if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType(), DL)) { Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC); return CastInst::Create(Instruction::SExt, NarrowMaxMin, II->getType()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index fdef49e310f8..ccf918f0b6db 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -11,11 +11,13 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Value.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include <optional> @@ -969,6 +971,25 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { Changed = true; } + const APInt *C1; + Value *V1; + // OP = { lshr, ashr } + // trunc ( OP i8 C1, V1) to i1 -> icmp eq V1, log_2(C1) iff C1 is power of 2 + if (DestWidth 
== 1 && match(Src, m_Shr(m_Power2(C1), m_Value(V1)))) { + Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero()); + Value *Icmp = Builder.CreateICmpEQ(V1, Right); + return replaceInstUsesWith(Trunc, Icmp); + } + + // OP = { lshr, ashr } + // trunc ( OP i8 C1, V1) to i1 -> icmp ult V1, log_2(C1 + 1) iff (C1 + 1) is + // power of 2 + if (DestWidth == 1 && match(Src, m_Shr(m_LowBitMask(C1), m_Value(V1)))) { + Value *Right = ConstantInt::get(V1->getType(), C1->countr_one()); + Value *Icmp = Builder.CreateICmpULT(V1, Right); + return replaceInstUsesWith(Trunc, Icmp); + } + return Changed ? &Trunc : nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3a8e04303815..99ea04816681 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" @@ -110,75 +111,41 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) { /// If AndCst is non-null, then the loaded value is masked with that constant /// before doing the comparison. This handles cases like "A[i]&4 == 0". 
Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( - LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, - ConstantInt *AndCst) { - if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() || - !GV->getValueType()->isArrayTy() || !GV->isConstant() || + LoadInst *LI, GetElementPtrInst *GEP, CmpInst &ICI, ConstantInt *AndCst) { + auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(GEP)); + if (LI->isVolatile() || !GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return nullptr; - Type *GEPSrcEltTy = GEP->getSourceElementType(); - if (GEPSrcEltTy->isArrayTy()) - GEPSrcEltTy = GEPSrcEltTy->getArrayElementType(); - if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy) + Type *EltTy = LI->getType(); + TypeSize EltSize = DL.getTypeStoreSize(EltTy); + if (EltSize.isScalable()) return nullptr; - Constant *Init = GV->getInitializer(); - if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) + LinearExpression Expr = decomposeLinearExpression(DL, GEP); + if (!Expr.Index || Expr.BasePtr != GV || Expr.Offset.getBitWidth() > 64) return nullptr; - uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); - // Don't blow up on huge arrays. - if (ArrayElementCount > MaxArraySizeForCombine) - return nullptr; + Constant *Init = GV->getInitializer(); + TypeSize GlobalSize = DL.getTypeAllocSize(Init->getType()); - // There are many forms of this optimization we can handle, for now, just do - // the simple index into a single-dimensional array or elements of equal size. 
- // - // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}} - // Or: GEP i8 GV, Idx + Value *Idx = Expr.Index; + const APInt &Stride = Expr.Scale; + const APInt &ConstOffset = Expr.Offset; - unsigned GEPIdxOp = 1; - if (GEP->getSourceElementType()->isArrayTy()) { - GEPIdxOp = 2; - if (!match(GEP->getOperand(1), m_ZeroInt())) - return nullptr; - } - if (GEP->getNumOperands() < GEPIdxOp + 1 || - isa<Constant>(GEP->getOperand(GEPIdxOp))) + // Allow an additional context offset, but only within the stride. + if (!ConstOffset.ult(Stride)) return nullptr; - // Check that indices after the variable are constants and in-range for the - // type they index. Collect the indices. This is typically for arrays of - // structs. - SmallVector<unsigned, 4> LaterIndices; - - Type *EltTy = Init->getType()->getArrayElementType(); - for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) { - ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!Idx) - return nullptr; // Variable index. - - uint64_t IdxVal = Idx->getZExtValue(); - if ((unsigned)IdxVal != IdxVal) - return nullptr; // Too large array index. - - if (StructType *STy = dyn_cast<StructType>(EltTy)) - EltTy = STy->getElementType(IdxVal); - else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { - if (IdxVal >= ATy->getNumElements()) - return nullptr; - EltTy = ATy->getElementType(); - } else { - return nullptr; // Unknown type. - } - - LaterIndices.push_back(IdxVal); - } + // Don't handle overlapping loads for now. + if (!Stride.uge(EltSize.getFixedValue())) + return nullptr; - Value *Idx = GEP->getOperand(GEPIdxOp); - // If the index type is non-canonical, wait for it to be canonicalized. - if (Idx->getType() != DL.getIndexType(GEP->getType())) + // Don't blow up on huge arrays. 
+ uint64_t ArrayElementCount = + divideCeil((GlobalSize.getFixedValue() - ConstOffset.getZExtValue()), + Stride.getZExtValue()); + if (ArrayElementCount > MaxArraySizeForCombine) return nullptr; enum { Overdefined = -3, Undefined = -2 }; @@ -211,18 +178,12 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); - for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { - Constant *Elt = Init->getAggregateElement(i); + APInt Offset = ConstOffset; + for (unsigned i = 0, e = ArrayElementCount; i != e; ++i, Offset += Stride) { + Constant *Elt = ConstantFoldLoadFromConst(Init, EltTy, Offset, DL); if (!Elt) return nullptr; - // If this is indexing an array of structures, get the structure element. - if (!LaterIndices.empty()) { - Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices); - if (!Elt) - return nullptr; - } - // If the element is masked, handle it. if (AndCst) { Elt = ConstantFoldBinaryOpOperands(Instruction::And, Elt, AndCst, DL); @@ -309,19 +270,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Now that we've scanned the entire array, emit our new comparison(s). We // order the state machines in complexity of the generated code. - // If inbounds keyword is not present, Idx * ElementSize can overflow. - // Let's assume that ElementSize is 2 and the wanted value is at offset 0. + // If inbounds keyword is not present, Idx * Stride can overflow. + // Let's assume that Stride is 2 and the wanted value is at offset 0. // Then, there are two possible values for Idx to match offset 0: // 0x00..00, 0x80..00. // Emitting 'icmp eq Idx, 0' isn't correct in this case because the // comparison is false if Idx was 0x80..00. // We need to erase the highest countTrailingZeros(ElementSize) bits of Idx. 
- unsigned ElementSize = - DL.getTypeAllocSize(Init->getType()->getArrayElementType()); auto MaskIdx = [&](Value *Idx) { - if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) { + if (!Expr.Flags.isInBounds() && Stride.countr_zero() != 0) { Value *Mask = Constant::getAllOnesValue(Idx->getType()); - Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize)); + Mask = Builder.CreateLShr(Mask, Stride.countr_zero()); Idx = Builder.CreateAnd(Idx, Mask); } return Idx; @@ -1997,10 +1956,8 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, if (auto *C2 = dyn_cast<ConstantInt>(Y)) if (auto *LI = dyn_cast<LoadInst>(X)) if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) - if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = - foldCmpLoadFromIndexedGlobal(LI, GEP, GV, Cmp, C2)) - return Res; + if (Instruction *Res = foldCmpLoadFromIndexedGlobal(LI, GEP, Cmp, C2)) + return Res; if (!Cmp.isEquality()) return nullptr; @@ -4353,10 +4310,9 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) { // Try to optimize things like "A[i] > 4" to index computations. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = - foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, GV, I)) - return Res; + if (Instruction *Res = + foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I)) + return Res; break; } @@ -6375,7 +6331,7 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) { // If a lossless truncate is possible... 
Type *SrcTy = CastOp0->getSrcTy(); - Constant *Res = getLosslessTrunc(C, SrcTy, CastOp0->getOpcode()); + Constant *Res = getLosslessInvCast(C, SrcTy, CastOp0->getOpcode(), DL); if (Res) { if (ICmp.isEquality()) return new ICmpInst(ICmp.getPredicate(), X, Res); @@ -8837,10 +8793,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { break; case Instruction::Load: if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) - if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = foldCmpLoadFromIndexedGlobal( - cast<LoadInst>(LHSI), GEP, GV, I)) - return Res; + if (Instruction *Res = + foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I)) + return Res; break; case Instruction::FPTrunc: if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC)) @@ -8944,14 +8899,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { } { - Value *CanonLHS = nullptr, *CanonRHS = nullptr; + Value *CanonLHS = nullptr; match(Op0, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonLHS))); - match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS))); - // (canonicalize(x) == x) => (x == x) if (CanonLHS == Op1) return new FCmpInst(Pred, Op1, Op1, "", &I); + Value *CanonRHS = nullptr; + match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS))); // (x == canonicalize(x)) => (x == x) if (CanonRHS == Op0) return new FCmpInst(Pred, Op0, Op0, "", &I); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 2340028ce93d..7a979c16da50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -222,23 +222,6 @@ public: bool fmulByZeroIsZero(Value *MulVal, FastMathFlags FMF, const Instruction *CtxI) const; - Constant *getLosslessTrunc(Constant *C, Type *TruncTy, unsigned ExtOp) { - Constant *TruncC = ConstantExpr::getTrunc(C, TruncTy); - Constant *ExtTruncC = - 
ConstantFoldCastOperand(ExtOp, TruncC, C->getType(), DL); - if (ExtTruncC && ExtTruncC == C) - return TruncC; - return nullptr; - } - - Constant *getLosslessUnsignedTrunc(Constant *C, Type *TruncTy) { - return getLosslessTrunc(C, TruncTy, Instruction::ZExt); - } - - Constant *getLosslessSignedTrunc(Constant *C, Type *TruncTy) { - return getLosslessTrunc(C, TruncTy, Instruction::SExt); - } - std::optional<std::pair<Intrinsic::ID, SmallVector<Value *, 3>>> convertOrOfShiftsToFunnelShift(Instruction &Or); @@ -710,7 +693,7 @@ public: bool foldAllocaCmp(AllocaInst *Alloca); Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI, GetElementPtrInst *GEP, - GlobalVariable *GV, CmpInst &ICI, + CmpInst &ICI, ConstantInt *AndCst = nullptr); Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, Constant *RHSC); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d7310b1c741c..a9aacc707cc2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1642,10 +1642,11 @@ static Instruction *narrowUDivURem(BinaryOperator &I, } Constant *C; + auto &DL = IC.getDataLayout(); if (isa<Instruction>(N) && match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) { // If the constant is the same in the smaller type, use the narrow version. - Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType()); + Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL); if (!TruncC) return nullptr; @@ -1656,7 +1657,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I, if (isa<Instruction>(D) && match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C))) { // If the constant is the same in the smaller type, use the narrow version. 
- Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType()); + Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL); if (!TruncC) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 6477141ab095..ed9a0be6981f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -841,7 +841,7 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) { NumZexts++; } else if (auto *C = dyn_cast<Constant>(V)) { // Make sure that constants can fit in the new type. - Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType); + Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType, DL); if (!Trunc) return nullptr; NewIncoming.push_back(Trunc); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index eb4332fbc095..9467463d39c0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1993,6 +1993,63 @@ Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp, return BinOp; } +/// Folds: +/// %a_sub = call @llvm.usub.sat(x, IntConst1) +/// %b_sub = call @llvm.usub.sat(y, IntConst2) +/// %or = or %a_sub, %b_sub +/// %cmp = icmp eq %or, 0 +/// %sel = select %cmp, 0, MostSignificantBit +/// into: +/// %a_sub' = usub.sat(x, IntConst1 - MostSignificantBit) +/// %b_sub' = usub.sat(y, IntConst2 - MostSignificantBit) +/// %or = or %a_sub', %b_sub' +/// %and = and %or, MostSignificantBit +/// Likewise, for vector arguments as well. 
+static Instruction *foldICmpUSubSatWithAndForMostSignificantBitCmp( + SelectInst &SI, ICmpInst *ICI, InstCombiner::BuilderTy &Builder) { + if (!SI.hasOneUse() || !ICI->hasOneUse()) + return nullptr; + CmpPredicate Pred; + Value *A, *B; + const APInt *Constant1, *Constant2; + if (!match(SI.getCondition(), + m_ICmp(Pred, + m_OneUse(m_Or(m_OneUse(m_Intrinsic<Intrinsic::usub_sat>( + m_Value(A), m_APInt(Constant1))), + m_OneUse(m_Intrinsic<Intrinsic::usub_sat>( + m_Value(B), m_APInt(Constant2))))), + m_Zero()))) + return nullptr; + + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + if (!(Pred == ICmpInst::ICMP_EQ && + (match(TrueVal, m_Zero()) && match(FalseVal, m_SignMask()))) || + (Pred == ICmpInst::ICMP_NE && + (match(TrueVal, m_SignMask()) && match(FalseVal, m_Zero())))) + return nullptr; + + auto *Ty = A->getType(); + unsigned BW = Constant1->getBitWidth(); + APInt MostSignificantBit = APInt::getSignMask(BW); + + // Anything over MSB is negative + if (Constant1->isNonNegative() || Constant2->isNonNegative()) + return nullptr; + + APInt AdjAP1 = *Constant1 - MostSignificantBit + 1; + APInt AdjAP2 = *Constant2 - MostSignificantBit + 1; + + auto *Adj1 = ConstantInt::get(Ty, AdjAP1); + auto *Adj2 = ConstantInt::get(Ty, AdjAP2); + + Value *NewA = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, Adj1); + Value *NewB = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, B, Adj2); + Value *Or = Builder.CreateOr(NewA, NewB); + Constant *MSBConst = ConstantInt::get(Ty, MostSignificantBit); + return BinaryOperator::CreateAnd(Or, MSBConst); +} + /// Visit a SelectInst that has an ICmpInst as its first operand. 
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { @@ -2009,6 +2066,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *NewSel = tryToReuseConstantFromSelectInComparison(SI, *ICI, *this)) return NewSel; + if (Instruction *Folded = + foldICmpUSubSatWithAndForMostSignificantBitCmp(SI, ICI, Builder)) + return Folded; // NOTE: if we wanted to, this is where to detect integer MIN/MAX bool Changed = false; @@ -2315,7 +2375,7 @@ Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) { // If the constant is the same after truncation to the smaller type and // extension to the original type, we can narrow the select. Type *SelType = Sel.getType(); - Constant *TruncC = getLosslessTrunc(C, SmallType, ExtOpcode); + Constant *TruncC = getLosslessInvCast(C, SmallType, ExtOpcode, DL); if (TruncC && ExtInst->hasOneUse()) { Value *TruncCVal = cast<Value>(TruncC); if (ExtInst == Sel.getFalseValue()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index f17fecd430a6..aa030294ff1e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -795,8 +795,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, I->dropPoisonGeneratingFlags(); return I; } - Known.Zero.lshrInPlace(ShiftAmt); - Known.One.lshrInPlace(ShiftAmt); + Known >>= ShiftAmt; if (ShiftAmt) Known.Zero.setHighBits(ShiftAmt); // high bits known zero. 
} else { @@ -1066,10 +1065,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, } } - Known.Zero = LHSKnown.Zero.shl(ShiftAmt) | - RHSKnown.Zero.lshr(BitWidth - ShiftAmt); - Known.One = LHSKnown.One.shl(ShiftAmt) | - RHSKnown.One.lshr(BitWidth - ShiftAmt); + LHSKnown <<= ShiftAmt; + RHSKnown >>= BitWidth - ShiftAmt; + Known = LHSKnown.unionWith(RHSKnown); KnownBitsComputed = true; break; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 5ee3bb1abe86..c2f045a2ab02 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2027,9 +2027,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, } if (OneUse) { - replaceAllDbgUsesWith(const_cast<PHINode &>(*PN), - const_cast<PHINode &>(*NewPN), - const_cast<PHINode &>(*PN), DT); + replaceAllDbgUsesWith(*PN, *NewPN, *PN, DT); } return replaceInstUsesWith(I, NewPN); } @@ -2570,7 +2568,7 @@ Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) { Constant *WideC; if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC))) return nullptr; - Constant *NarrowC = getLosslessTrunc(WideC, X->getType(), CastOpc); + Constant *NarrowC = getLosslessInvCast(WideC, X->getType(), CastOpc, DL); if (!NarrowC) return nullptr; Y = NarrowC; @@ -2676,6 +2674,62 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP, return nullptr; } +/// Combine constant offsets separated by variable offsets. 
+/// ptradd (ptradd (ptradd p, C1), x), C2 -> ptradd (ptradd p, x), C1+C2 +static Instruction *combineConstantOffsets(GetElementPtrInst &GEP, + InstCombinerImpl &IC) { + if (!GEP.hasAllConstantIndices()) + return nullptr; + + GEPNoWrapFlags NW = GEPNoWrapFlags::all(); + SmallVector<GetElementPtrInst *> Skipped; + auto *InnerGEP = dyn_cast<GetElementPtrInst>(GEP.getPointerOperand()); + while (true) { + if (!InnerGEP) + return nullptr; + + NW = NW.intersectForReassociate(InnerGEP->getNoWrapFlags()); + if (InnerGEP->hasAllConstantIndices()) + break; + + if (!InnerGEP->hasOneUse()) + return nullptr; + + Skipped.push_back(InnerGEP); + InnerGEP = dyn_cast<GetElementPtrInst>(InnerGEP->getPointerOperand()); + } + + // The two constant offset GEPs are directly adjacent: Let normal offset + // merging handle it. + if (Skipped.empty()) + return nullptr; + + // FIXME: This one-use check is not strictly necessary. Consider relaxing it + // if profitable. + if (!InnerGEP->hasOneUse()) + return nullptr; + + // Don't bother with vector splats. 
+ Type *Ty = GEP.getType(); + if (InnerGEP->getType() != Ty) + return nullptr; + + const DataLayout &DL = IC.getDataLayout(); + APInt Offset(DL.getIndexTypeSizeInBits(Ty), 0); + if (!GEP.accumulateConstantOffset(DL, Offset) || + !InnerGEP->accumulateConstantOffset(DL, Offset)) + return nullptr; + + IC.replaceOperand(*Skipped.back(), 0, InnerGEP->getPointerOperand()); + for (GetElementPtrInst *SkippedGEP : Skipped) + SkippedGEP->setNoWrapFlags(NW); + + return IC.replaceInstUsesWith( + GEP, + IC.Builder.CreatePtrAdd(Skipped.front(), IC.Builder.getInt(Offset), "", + NW.intersectForOffsetAdd(GEP.getNoWrapFlags()))); +} + Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src) { // Combine Indices - If the source pointer to this getelementptr instruction @@ -2687,125 +2741,56 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, if (auto *I = canonicalizeGEPOfConstGEPI8(GEP, Src, *this)) return I; - // For constant GEPs, use a more general offset-based folding approach. - Type *PtrTy = Src->getType()->getScalarType(); - if (GEP.hasAllConstantIndices() && - (Src->hasOneUse() || Src->hasAllConstantIndices())) { - // Split Src into a variable part and a constant suffix. - gep_type_iterator GTI = gep_type_begin(*Src); - Type *BaseType = GTI.getIndexedType(); - bool IsFirstType = true; - unsigned NumVarIndices = 0; - for (auto Pair : enumerate(Src->indices())) { - if (!isa<ConstantInt>(Pair.value())) { - BaseType = GTI.getIndexedType(); - IsFirstType = false; - NumVarIndices = Pair.index() + 1; - } - ++GTI; - } - - // Determine the offset for the constant suffix of Src. - APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0); - if (NumVarIndices != Src->getNumIndices()) { - // FIXME: getIndexedOffsetInType() does not handled scalable vectors. 
- if (BaseType->isScalableTy()) - return nullptr; - - SmallVector<Value *> ConstantIndices; - if (!IsFirstType) - ConstantIndices.push_back( - Constant::getNullValue(Type::getInt32Ty(GEP.getContext()))); - append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices)); - Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices); - } - - // Add the offset for GEP (which is fully constant). - if (!GEP.accumulateConstantOffset(DL, Offset)) - return nullptr; - - // Convert the total offset back into indices. - SmallVector<APInt> ConstIndices = - DL.getGEPIndicesForOffset(BaseType, Offset); - if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero())) - return nullptr; - - GEPNoWrapFlags NW = getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)); - SmallVector<Value *> Indices( - drop_end(Src->indices(), Src->getNumIndices() - NumVarIndices)); - for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) { - Indices.push_back(ConstantInt::get(GEP.getContext(), Idx)); - // Even if the total offset is inbounds, we may end up representing it - // by first performing a larger negative offset, and then a smaller - // positive one. The large negative offset might go out of bounds. Only - // preserve inbounds if all signs are the same. - if (Idx.isNonNegative() != ConstIndices[0].isNonNegative()) - NW = NW.withoutNoUnsignedSignedWrap(); - if (!Idx.isNonNegative()) - NW = NW.withoutNoUnsignedWrap(); - } - - return replaceInstUsesWith( - GEP, Builder.CreateGEP(Src->getSourceElementType(), Src->getOperand(0), - Indices, "", NW)); - } + if (auto *I = combineConstantOffsets(GEP, *this)) + return I; if (Src->getResultElementType() != GEP.getSourceElementType()) return nullptr; - SmallVector<Value*, 8> Indices; - // Find out whether the last index in the source GEP is a sequential idx. 
bool EndsWithSequential = false; for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); I != E; ++I) EndsWithSequential = I.isSequential(); + if (!EndsWithSequential) + return nullptr; - // Can we combine the two pointer arithmetics offsets? - if (EndsWithSequential) { - // Replace: gep (gep %P, long B), long A, ... - // With: T = long A+B; gep %P, T, ... - Value *SO1 = Src->getOperand(Src->getNumOperands()-1); - Value *GO1 = GEP.getOperand(1); - - // If they aren't the same type, then the input hasn't been processed - // by the loop above yet (which canonicalizes sequential index types to - // intptr_t). Just avoid transforming this until the input has been - // normalized. - if (SO1->getType() != GO1->getType()) - return nullptr; + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands() - 1); + Value *GO1 = GEP.getOperand(1); - Value *Sum = - simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); - // Only do the combine when we are sure the cost after the - // merge is never more than that before the merge. - if (Sum == nullptr) - return nullptr; + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; - Indices.append(Src->op_begin()+1, Src->op_end()-1); - Indices.push_back(Sum); - Indices.append(GEP.op_begin()+2, GEP.op_end()); - } else if (isa<Constant>(*GEP.idx_begin()) && - cast<Constant>(*GEP.idx_begin())->isNullValue() && - Src->getNumOperands() != 1) { - // Otherwise we can do the fold if the first index of the GEP is a zero - Indices.append(Src->op_begin()+1, Src->op_end()); - Indices.append(GEP.idx_begin()+1, GEP.idx_end()); - } - - // Don't create GEPs with more than one variable index. 
- unsigned NumVarIndices = - count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); }); - if (NumVarIndices > 1) + Value *Sum = + simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) return nullptr; - if (!Indices.empty()) - return replaceInstUsesWith( - GEP, Builder.CreateGEP( - Src->getSourceElementType(), Src->getOperand(0), Indices, "", - getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)))); + SmallVector<Value *, 8> Indices; + Indices.append(Src->op_begin() + 1, Src->op_end() - 1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin() + 2, GEP.op_end()); - return nullptr; + // Don't create GEPs with more than one non-zero index. + unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) { + auto *C = dyn_cast<Constant>(Idx); + return !C || !C->isNullValue(); + }); + if (NumNonZeroIndices > 1) + return nullptr; + + return replaceInstUsesWith( + GEP, Builder.CreateGEP( + Src->getSourceElementType(), Src->getOperand(0), Indices, "", + getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)))); } Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses, @@ -3238,6 +3223,19 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { drop_end(Indices), "", GEP.getNoWrapFlags())); } + // Strip leading zero indices. + auto *FirstIdx = dyn_cast<Constant>(Indices.front()); + if (FirstIdx && FirstIdx->isNullValue() && + !FirstIdx->getType()->isVectorTy()) { + gep_type_iterator GTI = gep_type_begin(GEP); + ++GTI; + if (!GTI.isStruct()) + return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(), + GEP.getPointerOperand(), + drop_begin(Indices), "", + GEP.getNoWrapFlags())); + } + // Scalarize vector operands; prefer splat-of-gep.as canonical form. 
// Note that this looses information about undef lanes; we run it after // demanded bits to partially mitigate that loss. @@ -3264,17 +3262,18 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, Res); } - bool SeenVarIndex = false; + bool SeenNonZeroIndex = false; for (auto [IdxNum, Idx] : enumerate(Indices)) { - if (isa<Constant>(Idx)) + auto *C = dyn_cast<Constant>(Idx); + if (C && C->isNullValue()) continue; - if (!SeenVarIndex) { - SeenVarIndex = true; + if (!SeenNonZeroIndex) { + SeenNonZeroIndex = true; continue; } - // GEP has multiple variable indices: Split it. + // GEP has multiple non-zero indices: Split it. ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum); Value *FrontGEP = Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices, @@ -4961,63 +4960,68 @@ Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) { Value * InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { // Try to push freeze through instructions that propagate but don't produce - // poison as far as possible. If an operand of freeze follows three - // conditions 1) one-use, 2) does not produce poison, and 3) has all but one - // guaranteed-non-poison operands then push the freeze through to the one - // operand that is not guaranteed non-poison. The actual transform is as - // follows. - // Op1 = ... ; Op1 can be posion - // Op0 = Inst(Op1, NonPoisonOps...) ; Op0 has only one use and only have - // ; single guaranteed-non-poison operands + // poison as far as possible. If an operand of freeze does not produce poison + // then push the freeze through to the operands that are not guaranteed + // non-poison. The actual transform is as follows. + // Op1 = ... ; Op1 can be poison + // Op0 = Inst(Op1, NonPoisonOps...) // ... = Freeze(Op0) // => // Op1 = ... // Op1.fr = Freeze(Op1) // ... = Inst(Op1.fr, NonPoisonOps...) 
- auto *OrigOp = OrigFI.getOperand(0); - auto *OrigOpInst = dyn_cast<Instruction>(OrigOp); - // While we could change the other users of OrigOp to use freeze(OrigOp), that - // potentially reduces their optimization potential, so let's only do this iff - // the OrigOp is only used by the freeze. - if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp)) - return nullptr; + auto CanPushFreeze = [](Value *V) { + if (!isa<Instruction>(V) || isa<PHINode>(V)) + return false; - // We can't push the freeze through an instruction which can itself create - // poison. If the only source of new poison is flags, we can simply - // strip them (since we know the only use is the freeze and nothing can - // benefit from them.) - if (canCreateUndefOrPoison(cast<Operator>(OrigOp), - /*ConsiderFlagsAndMetadata*/ false)) - return nullptr; + // We can't push the freeze through an instruction which can itself create + // poison. If the only source of new poison is flags, we can simply + // strip them (since we know the only use is the freeze and nothing can + // benefit from them.) + return !canCreateUndefOrPoison(cast<Operator>(V), + /*ConsiderFlagsAndMetadata*/ false); + }; - // If operand is guaranteed not to be poison, there is no need to add freeze - // to the operand. So we first find the operand that is not guaranteed to be - // poison. - Value *MaybePoisonOperand = nullptr; - for (Value *V : OrigOpInst->operands()) { - if (isa<MetadataAsValue>(V) || isGuaranteedNotToBeUndefOrPoison(V) || - // Treat identical operands as a single operand. - (MaybePoisonOperand && MaybePoisonOperand == V)) + // Pushing freezes up long instruction chains can be expensive. Instead, + // we directly push the freeze all the way to the leaves. However, we leave + // deduplication of freezes on the same value for freezeOtherUses(). 
+ Use *OrigUse = &OrigFI.getOperandUse(0); + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Use *, 8> Worklist; + Worklist.push_back(OrigUse); + while (!Worklist.empty()) { + auto *U = Worklist.pop_back_val(); + Value *V = U->get(); + if (!CanPushFreeze(V)) { + // If we can't push through the original instruction, abort the transform. + if (U == OrigUse) + return nullptr; + + auto *UserI = cast<Instruction>(U->getUser()); + Builder.SetInsertPoint(UserI); + Value *Frozen = Builder.CreateFreeze(V, V->getName() + ".fr"); + U->set(Frozen); continue; - if (!MaybePoisonOperand) - MaybePoisonOperand = V; - else - return nullptr; - } + } - OrigOpInst->dropPoisonGeneratingAnnotations(); + auto *I = cast<Instruction>(V); + if (!Visited.insert(I).second) + continue; - // If all operands are guaranteed to be non-poison, we can drop freeze. - if (!MaybePoisonOperand) - return OrigOp; + // reverse() to emit freezes in a more natural order. + for (Use &Op : reverse(I->operands())) { + Value *OpV = Op.get(); + if (isa<MetadataAsValue>(OpV) || isGuaranteedNotToBeUndefOrPoison(OpV)) + continue; + Worklist.push_back(&Op); + } - Builder.SetInsertPoint(OrigOpInst); - Value *FrozenMaybePoisonOperand = Builder.CreateFreeze( - MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr"); + I->dropPoisonGeneratingAnnotations(); + this->Worklist.add(I); + } - OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand); - return OrigOp; + return OrigUse->get(); } Instruction *InstCombinerImpl::foldFreezeIntoRecurrence(FreezeInst &FI, diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 50258af5e26c..42c3d4a4f4c4 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1219,7 +1219,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { std::optional<TypeSize> Size = 
AI->getAllocationSize(AI->getDataLayout()); // Check that size is known and can be stored in IntptrTy. - if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size)) + // TODO: Add support for scalable vectors if possible. + if (!Size || Size->isScalable() || + !ConstantInt::isValueValidForType(IntptrTy, *Size)) return; bool DoPoison = (ID == Intrinsic::lifetime_end); diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 66cdbfcf998c..832592e7663b 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -212,6 +212,15 @@ static cl::opt<float> "OR because of the hot percentile cutoff, if " "both are supplied.")); +static cl::opt<bool> ClStaticLinking( + "hwasan-static-linking", + cl::desc("Don't use .note.hwasan.globals section to instrument globals " + "from loadable libraries. " + "Note: in static binaries, the global variables section can be " + "accessed directly via linker-provided " + "__start_hwasan_globals and __stop_hwasan_globals symbols"), + cl::Hidden, cl::init(false)); + STATISTIC(NumTotalFuncs, "Number of total funcs"); STATISTIC(NumInstrumentedFuncs, "Number of instrumented funcs"); STATISTIC(NumNoProfileSummaryFuncs, "Number of funcs without PS"); @@ -335,6 +344,7 @@ private: FunctionAnalysisManager &FAM) const; void initializeModule(); void createHwasanCtorComdat(); + void createHwasanNote(); void initializeCallbacks(Module &M); @@ -533,20 +543,7 @@ void HWAddressSanitizerPass::printPipeline( OS << '>'; } -void HWAddressSanitizer::createHwasanCtorComdat() { - std::tie(HwasanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kHwasanModuleCtorName, kHwasanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. 
Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); - Ctor->setComdat(CtorComdat); - appendToGlobalCtors(M, Ctor, 0, Ctor); - }); - +void HWAddressSanitizer::createHwasanNote() { // Create a note that contains pointers to the list of global // descriptors. Adding a note to the output file will cause the linker to // create a PT_NOTE program header pointing to the note that we can use to @@ -630,6 +627,29 @@ void HWAddressSanitizer::createHwasanCtorComdat() { appendToCompilerUsed(M, Dummy); } +void HWAddressSanitizer::createHwasanCtorComdat() { + std::tie(HwasanCtorFunction, std::ignore) = + getOrCreateSanitizerCtorAndInitFunctions( + M, kHwasanModuleCtorName, kHwasanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); + Ctor->setComdat(CtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); + + // Do not create .note.hwasan.globals for static binaries, as it is only + // needed for instrumenting globals from dynamic libraries. In static + // binaries, the global variables section can be accessed directly via the + // __start_hwasan_globals and __stop_hwasan_globals symbols inserted by the + // linker. + if (!ClStaticLinking) + createHwasanNote(); +} + /// Module-level initialization. /// /// inserts a call to __hwasan_init to the module's constructor list. 
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index a9a0731f16d9..ecb2f2dbc552 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/DataAccessProf.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/MemProfCommon.h" @@ -75,6 +76,10 @@ static cl::opt<unsigned> MinMatchedColdBytePercent( "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes matched to hint allocation cold")); +static cl::opt<bool> AnnotateStaticDataSectionPrefix( + "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden, + cl::desc("If true, annotate the static data section prefix")); + // Matching statistics STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile."); STATISTIC(NumOfMemProfMismatch, @@ -90,6 +95,14 @@ STATISTIC(NumOfMemProfMatchedAllocs, "Number of matched memory profile allocs."); STATISTIC(NumOfMemProfMatchedCallSites, "Number of matched memory profile callsites."); +STATISTIC(NumOfMemProfHotGlobalVars, + "Number of global vars annotated with 'hot' section prefix."); +STATISTIC(NumOfMemProfColdGlobalVars, + "Number of global vars annotated with 'unlikely' section prefix."); +STATISTIC(NumOfMemProfUnknownGlobalVars, + "Number of global vars with unknown hotness (no section prefix)."); +STATISTIC(NumOfMemProfExplicitSectionGlobalVars, + "Number of global vars with user-specified section (not annotated)."); static void addCallsiteMetadata(Instruction &I, ArrayRef<uint64_t> InlinedCallStack, @@ -674,11 +687,12 @@ MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile, } PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { - // Return immediately 
if the module doesn't contain any function. - if (M.empty()) + // Return immediately if the module doesn't contain any function or global + // variables. + if (M.empty() && M.globals().empty()) return PreservedAnalyses::all(); - LLVM_DEBUG(dbgs() << "Read in memory profile:"); + LLVM_DEBUG(dbgs() << "Read in memory profile:\n"); auto &Ctx = M.getContext(); auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS); if (Error E = ReaderOrErr.takeError()) { @@ -703,6 +717,14 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::all(); } + const bool Changed = + annotateGlobalVariables(M, MemProfReader->getDataAccessProfileData()); + + // If the module doesn't contain any function, return after we process all + // global variables. + if (M.empty()) + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin()); @@ -752,3 +774,95 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::none(); } + +// Returns true iff the global variable has custom section either by +// __attribute__((section("name"))) +// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate) +// or #pragma clang section directives +// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section). 
+static bool hasExplicitSectionName(const GlobalVariable &GVar) { + if (GVar.hasSection()) + return true; + + auto Attrs = GVar.getAttributes(); + if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") || + Attrs.hasAttribute("relro-section") || + Attrs.hasAttribute("rodata-section")) + return true; + return false; +} + +bool MemProfUsePass::annotateGlobalVariables( + Module &M, const memprof::DataAccessProfData *DataAccessProf) { + if (!AnnotateStaticDataSectionPrefix || M.globals().empty()) + return false; + + if (!DataAccessProf) { + M.getContext().diagnose(DiagnosticInfoPGOProfile( + MemoryProfileFileName.data(), + StringRef("Data access profiles not found in memprof. Ignore " + "-memprof-annotate-static-data-prefix."), + DS_Warning)); + return false; + } + + bool Changed = false; + // Iterate all global variables in the module and annotate them based on + // data access profiles. Note it's up to the linker to decide how to map input + // sections to output sections, and one conservative practice is to map + // unlikely-prefixed ones to unlikely output section, and map the rest + // (hot-prefixed or prefix-less) to the canonical output section. + for (GlobalVariable &GVar : M.globals()) { + assert(!GVar.getSectionPrefix().has_value() && + "GVar shouldn't have section prefix yet"); + if (GVar.isDeclarationForLinker()) + continue; + + if (hasExplicitSectionName(GVar)) { + ++NumOfMemProfExplicitSectionGlobalVars; + LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName() + << " has explicit section name. Skip annotating.\n"); + continue; + } + + StringRef Name = GVar.getName(); + // Skip string literals as their mangled names don't stay stable across + // binary releases. + // TODO: Track string content hash in the profiles and compute it inside the + // compiler to categeorize the hotness string literals. 
+ if (Name.starts_with(".str")) { + + LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n"); + continue; + } + + // DataAccessProfRecord's get* methods will canonicalize the name under the + // hood before looking it up, so optimizer doesn't need to do it. + std::optional<DataAccessProfRecord> Record = + DataAccessProf->getProfileRecord(Name); + // Annotate a global variable as hot if it has non-zero sampled count, and + // annotate it as cold if it's seen in the profiled binary + // file but doesn't have any access sample. + // For logging, optimization remark emitter requires a llvm::Function, but + // it's not well defined how to associate a global variable with a function. + // So we just print out the static data section prefix in LLVM_DEBUG. + if (Record && Record->AccessCount > 0) { + ++NumOfMemProfHotGlobalVars; + GVar.setSectionPrefix("hot"); + Changed = true; + LLVM_DEBUG(dbgs() << "Global variable " << Name + << " is annotated as hot\n"); + } else if (DataAccessProf->isKnownColdSymbol(Name)) { + ++NumOfMemProfColdGlobalVars; + GVar.setSectionPrefix("unlikely"); + Changed = true; + LLVM_DEBUG(dbgs() << "Global variable " << Name + << " is annotated as unlikely\n"); + } else { + ++NumOfMemProfUnknownGlobalVars; + LLVM_DEBUG(dbgs() << "Global variable " << Name << " is not annotated\n"); + } + } + + return Changed; +} diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 27292d1a66c3..9899a2aae2b1 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3263,7 +3263,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return true; } - /// Heuristically instrument unknown intrinsics. + /// Returns whether it was able to heuristically instrument unknown + /// intrinsics. 
/// /// The main purpose of this code is to do something reasonable with all /// random intrinsics we might encounter, most importantly - SIMD intrinsics. @@ -3273,7 +3274,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// We special-case intrinsics where this approach fails. See llvm.bswap /// handling as an example of that. - bool handleUnknownIntrinsicUnlogged(IntrinsicInst &I) { + bool maybeHandleUnknownIntrinsicUnlogged(IntrinsicInst &I) { unsigned NumArgOperands = I.arg_size(); if (NumArgOperands == 0) return false; @@ -3300,8 +3301,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return false; } - bool handleUnknownIntrinsic(IntrinsicInst &I) { - if (handleUnknownIntrinsicUnlogged(I)) { + bool maybeHandleUnknownIntrinsic(IntrinsicInst &I) { + if (maybeHandleUnknownIntrinsicUnlogged(I)) { if (ClDumpHeuristicInstructions) dumpInst(I); @@ -3860,7 +3861,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // Three operands: // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b) // (this is equivalent to multiply-add on %a and %b, followed by // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. 
add distinction @@ -3902,15 +3903,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getPrimitiveSizeInBits()); if (I.arg_size() == 3) { - assert(ParamType == ReturnType); - assert(ParamType == I.getArgOperand(0)->getType()); + [[maybe_unused]] auto *AccumulatorType = + cast<FixedVectorType>(I.getOperand(0)->getType()); + assert(AccumulatorType == ReturnType); } FixedVectorType *ImplicitReturnType = ReturnType; // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { - ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy( - EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits())); + ImplicitReturnType = cast<FixedVectorType>( + getMMXVectorTy(EltSizeInBits * ReductionFactor, + ParamType->getPrimitiveSizeInBits())); ParamType = cast<FixedVectorType>( getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits())); @@ -3958,7 +3961,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Step 2: instrument horizontal add // We don't need bit-precise horizontalReduce because we only want to check - // if each pair of elements is fully zero. + // if each pair/quad of elements is fully zero. // Cast to <4 x i32>. Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType); @@ -3968,7 +3971,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Constant::getNullValue(Horizontal->getType())), ImplicitReturnType); - // Cast it back to the required fake return type (<1 x i64>). + // Cast it back to the required fake return type (if MMX: <1 x i64>; for + // AVX, it is already correct). 
if (EltSizeInBits) OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I)); @@ -5262,7 +5266,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleShadowOr(I); } - void visitIntrinsicInst(IntrinsicInst &I) { + bool maybeHandleCrossPlatformIntrinsic(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: @@ -5342,6 +5346,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorReduceWithStarterIntrinsic(I); break; + case Intrinsic::scmp: + case Intrinsic::ucmp: { + handleShadowOr(I); + break; + } + + case Intrinsic::fshl: + case Intrinsic::fshr: + handleFunnelShift(I); + break; + + case Intrinsic::is_constant: + // The result of llvm.is.constant() is always defined. + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + break; + + default: + return false; + } + + return true; + } + + bool maybeHandleX86SIMDIntrinsic(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { case Intrinsic::x86_sse_stmxcsr: handleStmxcsr(I); break; @@ -5392,6 +5422,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + // Convert Packed Single Precision Floating-Point Values + // to Packed Signed Doubleword Integer Values + // + // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512 + // (<16 x float>, <16 x i32>, i16, i32) + case Intrinsic::x86_avx512_mask_cvtps2dq_512: + handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false); + break; + // Convert Packed Double Precision Floating-Point Values // to Packed Single Precision Floating-Point Values case Intrinsic::x86_sse2_cvtpd2ps: @@ -5492,23 +5531,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: - case Intrinsic::aarch64_neon_rshrn: - case Intrinsic::aarch64_neon_sqrshl: - case Intrinsic::aarch64_neon_sqrshrn: - case 
Intrinsic::aarch64_neon_sqrshrun: - case Intrinsic::aarch64_neon_sqshl: - case Intrinsic::aarch64_neon_sqshlu: - case Intrinsic::aarch64_neon_sqshrn: - case Intrinsic::aarch64_neon_sqshrun: - case Intrinsic::aarch64_neon_srshl: - case Intrinsic::aarch64_neon_sshl: - case Intrinsic::aarch64_neon_uqrshl: - case Intrinsic::aarch64_neon_uqrshrn: - case Intrinsic::aarch64_neon_uqshl: - case Intrinsic::aarch64_neon_uqshrn: - case Intrinsic::aarch64_neon_urshl: - case Intrinsic::aarch64_neon_ushl: - // Not handled here: aarch64_neon_vsli (vector shift left and insert) handleVectorShiftIntrinsic(I, /* Variable */ false); break; case Intrinsic::x86_avx2_psllv_d: @@ -5621,19 +5643,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // Multiply and Add Packed Signed and Unsigned Bytes // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // Multiply and Add Unsigned and Signed Bytes With Saturation // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 // (< 4 x i32>, < 4 x i32>, < 4 x i32>) @@ -5652,30 +5674,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // These intrinsics are auto-upgraded into non-masked forms: // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x 
i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) case Intrinsic::x86_avx512_vpdpbusd_128: case Intrinsic::x86_avx512_vpdpbusd_256: case Intrinsic::x86_avx512_vpdpbusd_512: @@ -5930,7 +5952,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_max_pd_512: { // These AVX512 variants contain the rounding mode as a trailing flag. 
// Earlier variants do not have a trailing flag and are already handled - // by maybeHandleSimpleNomemIntrinsic(I, 0) via handleUnknownIntrinsic. + // by maybeHandleSimpleNomemIntrinsic(I, 0) via + // maybeHandleUnknownIntrinsic. [[maybe_unused]] bool Success = maybeHandleSimpleNomemIntrinsic(I, /*trailingFlags=*/1); assert(Success); @@ -5988,15 +6011,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /*trailingVerbatimArgs=*/1); break; - // Convert Packed Single Precision Floating-Point Values - // to Packed Signed Doubleword Integer Values - // - // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512 - // (<16 x float>, <16 x i32>, i16, i32) - case Intrinsic::x86_avx512_mask_cvtps2dq_512: - handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false); - break; - // AVX512 PMOV: Packed MOV, with truncation // Precisely handled by applying the same intrinsic to the shadow case Intrinsic::x86_avx512_mask_pmov_dw_512: @@ -6074,15 +6088,33 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleAVXGF2P8Affine(I); break; - case Intrinsic::fshl: - case Intrinsic::fshr: - handleFunnelShift(I); - break; + default: + return false; + } - case Intrinsic::is_constant: - // The result of llvm.is.constant() is always defined. 
- setShadow(&I, getCleanShadow(&I)); - setOrigin(&I, getCleanOrigin()); + return true; + } + + bool maybeHandleArmSIMDIntrinsic(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::aarch64_neon_rshrn: + case Intrinsic::aarch64_neon_sqrshl: + case Intrinsic::aarch64_neon_sqrshrn: + case Intrinsic::aarch64_neon_sqrshrun: + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_sqshrn: + case Intrinsic::aarch64_neon_sqshrun: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_uqrshl: + case Intrinsic::aarch64_neon_uqrshrn: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_uqshrn: + case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_ushl: + // Not handled here: aarch64_neon_vsli (vector shift left and insert) + handleVectorShiftIntrinsic(I, /* Variable */ false); break; // TODO: handling max/min similarly to AND/OR may be more precise @@ -6233,17 +6265,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } - case Intrinsic::scmp: - case Intrinsic::ucmp: { - handleShadowOr(I); - break; - } - default: - if (!handleUnknownIntrinsic(I)) - visitInstruction(I); - break; + return false; } + + return true; + } + + void visitIntrinsicInst(IntrinsicInst &I) { + if (maybeHandleCrossPlatformIntrinsic(I)) + return; + + if (maybeHandleX86SIMDIntrinsic(I)) + return; + + if (maybeHandleArmSIMDIntrinsic(I)) + return; + + if (maybeHandleUnknownIntrinsic(I)) + return; + + visitInstruction(I); } void visitLibAtomicLoad(CallBase &CB) { diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 1ddb8ae9518f..4acc3f2d8469 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -19,9 +19,11 @@ #include "llvm/Analysis/ConstraintSystem.h" #include 
"llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -170,10 +172,12 @@ struct State { DominatorTree &DT; LoopInfo &LI; ScalarEvolution &SE; + TargetLibraryInfo &TLI; SmallVector<FactOrCheck, 64> WorkList; - State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) - : DT(DT), LI(LI), SE(SE) {} + State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, + TargetLibraryInfo &TLI) + : DT(DT), LI(LI), SE(SE), TLI(TLI) {} /// Process block \p BB and add known facts to work-list. void addInfoFor(BasicBlock &BB); @@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) { } } +static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP, + uint64_t AccessSize, + CmpPredicate &Pred, Value *&A, + Value *&B, const DataLayout &DL, + const TargetLibraryInfo &TLI) { + auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL); + if (!Offset.NW.hasNoUnsignedWrap()) + return false; + + if (Offset.VariableOffsets.size() != 1) + return false; + + uint64_t BitWidth = Offset.ConstantOffset.getBitWidth(); + auto &[Index, Scale] = Offset.VariableOffsets.front(); + // Bail out on non-canonical GEPs. + if (Index->getType()->getScalarSizeInBits() != BitWidth) + return false; + + ObjectSizeOpts Opts; + // Workaround for gep inbounds, ptr null, idx. + Opts.NullIsUnknownSize = true; + // Be conservative since we are not clear on whether an out of bounds access + // to the padding is UB or not. 
+ Opts.RoundToAlign = true; + std::optional<TypeSize> Size = + getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts); + if (!Size || Size->isScalable()) + return false; + + // Index * Scale + ConstOffset + AccessSize <= AllocSize + // With nuw flag, we know that the index addition doesn't have unsigned wrap. + // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid + // value for Index. + APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize, + /*isSigned=*/false, /*implicitTrunc=*/true) - + Offset.ConstantOffset) + .udiv(Scale); + Pred = ICmpInst::ICMP_ULE; + A = Index; + B = ConstantInt::get(Index->getType(), MaxIndex); + return true; +} + void State::addInfoFor(BasicBlock &BB) { addInfoForInductions(BB); + auto &DL = BB.getDataLayout(); - // True as long as long as the current instruction is guaranteed to execute. + // True as long as the current instruction is guaranteed to execute. bool GuaranteedToExecute = true; // Queue conditions and assumes. for (Instruction &I : BB) { @@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) { continue; } + auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) { + auto *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP) + return; + TypeSize AccessSize = DL.getTypeStoreSize(AccessType); + if (!AccessSize.isFixed()) + return; + if (GuaranteedToExecute) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(), + Pred, A, B, DL, TLI)) { + // The memory access is guaranteed to execute when BB is entered, + // hence the constraint holds on entry to BB. 
+ WorkList.emplace_back(FactOrCheck::getConditionFact( + DT.getNode(I.getParent()), Pred, A, B)); + } + } else { + WorkList.emplace_back( + FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I)); + } + }; + + if (auto *LI = dyn_cast<LoadInst>(&I)) { + if (!LI->isVolatile()) + AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType()); + } + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (!SI->isVolatile()) + AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType()); + } + auto *II = dyn_cast<IntrinsicInst>(&I); Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic; switch (ID) { @@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A, LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n"); auto R = Info.getConstraintForSolving(Pred, A, B); - if (R.empty() || !R.isValid(Info)){ + if (R.empty() || !R.isValid(Info)) { LLVM_DEBUG(dbgs() << " failed to decompose condition\n"); return std::nullopt; } @@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info, static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, + TargetLibraryInfo &TLI) { bool Changed = false; DT.updateDFSNumbers(); SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args())); ConstraintInfo Info(F.getDataLayout(), FunctionArgs); - State S(DT, LI, SE); + State S(DT, LI, SE, TLI); std::unique_ptr<Module> ReproducerModule( DumpReproducers ? 
new Module(F.getName(), F.getContext()) : nullptr); @@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } continue; } + + auto &DL = F.getDataLayout(); + auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess( + *cast<GetElementPtrInst>(Ptr), + DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL, + TLI)) + AddFact(Pred, A, B); + }; + + if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) { + AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType()); + continue; + } + if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) { + AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType()); + continue; + } } Value *A = nullptr, *B = nullptr; @@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F, auto &LI = AM.getResult<LoopAnalysis>(F); auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - if (!eliminateConstraints(F, DT, LI, SE, ORE)) + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 434b55868c99..944b253e0f5e 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -521,7 +521,7 @@ private: Instruction *SIUse = dyn_cast<Instruction>(SI->user_back()); // The use of the select inst should be either a phi or another select. 
- if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) + if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) return false; BasicBlock *SIBB = SI->getParent(); @@ -581,15 +581,17 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB); - if (SwitchPhiDefBB == SwitchBlock) { + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { TPaths = std::move(PathsToPhiDef); return; } + assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); if (PathsToSwitchBB.empty()) return; @@ -610,13 +612,16 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB) { + VisitedBlocks &VB, + unsigned PathsLimit) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { + if (Res.size() >= PathsLimit) + break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -652,8 +657,9 @@ private: // Direct predecessor, just add to the path. 
if (IncomingPhiDefBB == IncomingBB) { - std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + assert(PathsLimit > Res.size()); + std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( + StateDef, IncomingPhi, VB, PathsLimit - Res.size()); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -666,13 +672,17 @@ private: continue; PathsType IntermediatePaths; - IntermediatePaths = - paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); + assert(PathsLimit > Res.size()); + auto InterPathLimit = PathsLimit - Res.size(); + IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, + /* PathDepth = */ 1, InterPathLimit); if (IntermediatePaths.empty()) continue; + assert(InterPathLimit >= IntermediatePaths.size()); + auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -687,7 +697,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth) { + unsigned PathDepth, unsigned PathsLimit) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -714,6 +724,8 @@ private: // is used to prevent a duplicate path from being generated SmallPtrSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { + if (Res.size() >= PathsLimit) + break; if (!Successors.insert(Succ).second) continue; @@ -735,14 +747,12 @@ private: // coverage and compile time. 
if (LI->getLoopFor(Succ) != CurrLoop) continue; - - PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); + assert(PathsLimit > Res.size()); + PathsType SuccPaths = + paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); - if (Res.size() >= MaxNumPaths) { - return Res; - } } } // This block could now be visited again from a different predecessor. Note diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 4baa3b3eb824..26e17cc849bf 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() || isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects()) + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + CurInst->getType()->isTokenLikeTy()) return false; // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index e9bf59c6850a..b60b15b6c3a2 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -35,8 +36,38 @@ static bool tryToImproveAlign( return true; } } - // TODO: Also handle memory intrinsics. - return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + + // TODO: Handle more memory intrinsics. 
+ switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + case Intrinsic::masked_store: { + int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2; + Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getArgOperand(0) + : II->getArgOperand(1); + Type *Type = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getType() + : II->getArgOperand(0)->getType(); + + Align OldAlign = + cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue(); + Align PrefAlign = DL.getPrefTypeAlign(Type); + Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); + if (NewAlign <= OldAlign) + return false; + + Value *V = + ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value()); + II->setOperand(AlignOpIdx, V); + return true; + } + default: + return false; + } } bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index c2a737d8f9a4..c7d71eb5633e 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // AvailablePreds vector as we go so that all of the PHI entries for this // predecessor use the same bitcast. Value *&PredV = I->second; - if (PredV->getType() != LoadI->getType()) + if (PredV->getType() != LoadI->getType()) { PredV = CastInst::CreateBitOrPointerCast( PredV, LoadI->getType(), "", P->getTerminator()->getIterator()); + // The new cast is producing the value used to replace the load + // instruction, so uses the load's debug location. If P does not always + // branch to the load BB however then the debug location must be dropped, + // as it is hoisted past a conditional branch. + DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1 + ? 
LoadI->getDebugLoc() + : DebugLoc::getDropped(); + cast<CastInst>(PredV)->setDebugLoc(DL); + } PN->addIncoming(PredV, I->first); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 03b92d3338a9..0874b29ab7d2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -39,6 +39,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/HashRecognize.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" @@ -143,6 +144,14 @@ static cl::opt<bool, true> cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden); +bool DisableLIRP::HashRecognize; +static cl::opt<bool, true> + DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", + cl::desc("Proceed with loop idiom recognize pass, " + "but do not optimize CRC loops."), + cl::location(DisableLIRP::HashRecognize), + cl::init(false), cl::ReallyHidden); + static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " @@ -242,6 +251,7 @@ private: const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); + bool optimizeCRCLoop(const PolynomialInfo &Info); /// @} /// \name Noncountable Loop Idiom Handling @@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). 
OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); + std::optional<PolynomialInfo> HR; + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) + if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || + HasMemcpy || !DisableLIRP::HashRecognize) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } + + // Optimize a CRC loop if HashRecognize found one, provided we're not + // optimizing for size. + if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics) + if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) + optimizeCRCLoop(*Res); + return MadeChange; } @@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, return false; } +bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { + // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using + // carry-less multiplication instructions, which is more efficient than our + // Sarwate table-lookup optimization. Hence, until we're able to emit + // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, + // disable the optimization for Hexagon. + Module &M = *CurLoop->getHeader()->getModule(); + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::hexagon) + return false; + + // First, create a new GlobalVariable corresponding to the + // Sarwate-lookup-table. 
+ Type *CRCTy = Info.LHS->getType(); + unsigned CRCBW = CRCTy->getIntegerBitWidth(); + std::array<Constant *, 256> CRCConstants; + transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), + CRCConstants.begin(), + [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); + Constant *ConstArray = + ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); + GlobalVariable *GV = + new GlobalVariable(M, ConstArray->getType(), true, + GlobalValue::PrivateLinkage, ConstArray, ".crctable"); + + PHINode *IV = CurLoop->getCanonicalInductionVariable(); + SmallVector<PHINode *, 2> Cleanup; + + // Next, mark all PHIs for removal except IV. + { + for (PHINode &PN : CurLoop->getHeader()->phis()) { + if (&PN == IV) + continue; + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + Cleanup.push_back(&PN); + } + } + + // Next, fix up the trip count. + { + unsigned NewBTC = (Info.TripCount / 8) - 1; + BasicBlock *LoopBlk = CurLoop->getLoopLatch(); + BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); + CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk + ? ICmpInst::Predicate::ICMP_NE + : ICmpInst::Predicate::ICMP_EQ; + Instruction *ExitCond = CurLoop->getLatchCmpInst(); + Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); + IRBuilder<> Builder(ExitCond); + Value *NewExitCond = + Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); + ExitCond->replaceAllUsesWith(NewExitCond); + deleteDeadInstruction(ExitCond); + } + + // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all + // uses of ComputedValue. + // + // Little-endian: + // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] + // Big-Endian: + // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] + { + auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { + Type *OpTy = Op->getType(); + unsigned OpBW = OpTy->getIntegerBitWidth(); + return OpBW > 8 + ? 
Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) + : Op; + }; + auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, + const Twine &Name) { + Type *OpTy = Op->getType(); + + // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to + // use the CRC's bitwidth as the reference for shifting right. + return LoByte(Builder, + CRCBW > 8 ? Builder.CreateLShr( + Op, ConstantInt::get(OpTy, CRCBW - 8), Name) + : Op, + Name + ".lo.byte"); + }; + + IRBuilder<> Builder(CurLoop->getHeader(), + CurLoop->getHeader()->getFirstNonPHIIt()); + + // Create the CRC PHI, and initialize its incoming value to the initial + // value of CRC. + PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); + CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); + + // CRC is now an evolving variable, initialized to the PHI. + Value *CRC = CRCPhi; + + // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte + // of LHSAux), if LHSAux is non-nullptr. + Value *Indexer = CRC; + if (Value *Data = Info.LHSAux) { + Type *DataTy = Data->getType(); + + // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we + // shift right by that amount, and take the lo-byte (in the little-endian + // case), or shift left by that amount, and take the hi-idx (in the + // big-endian case). + Value *IVBits = Builder.CreateZExtOrTrunc( + Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); + Value *DataIndexer = + Info.ByteOrderSwapped + ? Builder.CreateShl(Data, IVBits, "data.indexer") + : Builder.CreateLShr(Data, IVBits, "data.indexer"); + Indexer = Builder.CreateXor( + DataIndexer, + Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), + "crc.data.indexer"); + } + + Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") + : LoByte(Builder, Indexer, "indexer.lo"); + + // Always index into a GEP using the index type. 
+ Indexer = Builder.CreateZExt( + Indexer, SE->getDataLayout().getIndexType(GV->getType()), + "indexer.ext"); + + // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. + Value *CRCTableGEP = + Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); + Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); + + // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of + // CRC-8. + Value *CRCNext = CRCTableLd; + if (CRCBW > 8) { + Value *CRCShift = Info.ByteOrderSwapped + ? Builder.CreateShl(CRC, 8, "crc.be.shift") + : Builder.CreateLShr(CRC, 8, "crc.le.shift"); + CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); + } + + // Connect the back-edge for the loop, and RAUW the ComputedValue. + CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); + Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, + CurLoop->getLoopLatch()); + } + + // Cleanup. + { + for (PHINode *PN : Cleanup) + RecursivelyDeleteDeadPHINode(PN); + SE->forgetLoop(CurLoop); + } + return true; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << CurLoop->getHeader()->getParent()->getName() diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f7d2258e1c28..2bda9d83236e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; UP.SCEVExpansionBudget = SCEVCheapExpansionBudget; UP.RuntimeUnrollMultiExit = false; + UP.AddAdditionalAccumulators = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, SE, UP, &ORE); @@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, ULO.Heart = getLoopConvergenceHeart(L); ULO.SCEVExpansionBudget = 
UP.SCEVExpansionBudget; ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit; + ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators; LoopUnrollResult UnrollResult = UnrollLoop( L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 8b9d06d7e443..8a5569743ab4 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -247,8 +247,8 @@ private: /// index I' according to UserChain produced by function "find". /// /// The building conceptually takes two steps: - /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I + /// 1) iteratively distribute sext/zext/trunc towards the leaves of the + /// expression tree that computes I /// 2) reassociate the expression tree to the form I' + C. /// /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute @@ -260,29 +260,30 @@ private: Value *rebuildWithoutConstOffset(); /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => + /// offset, distribute sext/zext/trunc to the operands of all operators in + /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) => /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). /// /// The function also updates UserChain to point to new subexpressions after - /// distributing s/zext. e.g., the old UserChain of the above example is - /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), + /// distributing sext/zext/trunc. e.g., the old UserChain of the above example + /// is + /// 5 -> b + 5 -> a + (b + 5) -> sext(...) 
-> zext(sext(...)), /// and the new UserChain is - /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) + /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) /// /// \p ChainIndex The index to UserChain. ChainIndex is initially /// UserChain.size() - 1, and is decremented during /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); + Value *distributeCastsAndCloneChain(unsigned ChainIndex); /// Reassociates the GEP index to the form I' + C and returns I'. Value *removeConstOffset(unsigned ChainIndex); - /// A helper function to apply ExtInsts, a list of s/zext, to value V. - /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function + /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value + /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function /// returns "sext i32 (zext i16 V to i32) to i64". - Value *applyExts(Value *V); + Value *applyCasts(Value *V); /// A helper function that returns whether we can trace into the operands /// of binary operator BO for a constant offset. @@ -307,8 +308,8 @@ private: SmallVector<User *, 8> UserChain; /// A data structure used in rebuildWithoutConstOffset. Contains all - /// sext/zext instructions along UserChain. - SmallVector<CastInst *, 16> ExtInsts; + /// sext/zext/trunc instructions along UserChain. + SmallVector<CastInst *, 16> CastInsts; /// Insertion position of cloned instructions. BasicBlock::iterator IP; @@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, } Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". + // Do not trace into "or" unless it is equivalent to "add nuw nsw". // This is the case if the or's disjoint flag is set. 
if (BO->getOpcode() == Instruction::Or && !cast<PossiblyDisjointInst>(BO)->isDisjoint()) @@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub) return false; - // In addition, tracing into BO requires that its surrounding s/zext (if - // any) is distributable to both operands. + // In addition, tracing into BO requires that its surrounding sext/zext/trunc + // (if any) is distributable to both operands. // // Suppose BO = A op B. // SignExtended | ZeroExtended | Distributable? @@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, return ConstantOffset; } -Value *ConstantOffsetExtractor::applyExts(Value *V) { +Value *ConstantOffsetExtractor::applyCasts(Value *V) { Value *Current = V; - // ExtInsts is built in the use-def order. Therefore, we apply them to V + // CastInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. - for (CastInst *I : llvm::reverse(ExtInsts)) { + for (CastInst *I : llvm::reverse(CastInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // Try to constant fold the cast. Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL); @@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { continue; } - Instruction *Ext = I->clone(); - Ext->setOperand(0, Current); + Instruction *Cast = I->clone(); + Cast->setOperand(0, Current); // In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so // we assume that it is ok to redistribute trunc over add/sub/or. But for // example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc // nuw (add A, B))). To make such redistributions legal we drop all the // poison generating flags from cloned trunc instructions here. 
- if (isa<TruncInst>(Ext)) - Ext->dropPoisonGeneratingFlags(); - Ext->insertBefore(*IP->getParent(), IP); - Current = Ext; + if (isa<TruncInst>(Cast)) + Cast->dropPoisonGeneratingFlags(); + Cast->insertBefore(*IP->getParent(), IP); + Current = Cast; } return Current; } Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); - // Remove all nullptrs (used to be s/zext) from UserChain. + distributeCastsAndCloneChain(UserChain.size() - 1); + // Remove all nullptrs (used to be sext/zext/trunc) from UserChain. unsigned NewSize = 0; for (User *I : UserChain) { if (I != nullptr) { @@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { } Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { +ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) { User *U = UserChain[ChainIndex]; if (ChainIndex == 0) { assert(isa<ConstantInt>(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U)); + // If U is a ConstantInt, applyCasts will return a ConstantInt as well. + return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U)); } if (CastInst *Cast = dyn_cast<CastInst>(U)) { assert( (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) && "Only following instructions can be traced: sext, zext & trunc"); - ExtInsts.push_back(Cast); + CastInsts.push_back(Cast); UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); + return distributeCastsAndCloneChain(ChainIndex - 1); } // Function find only trace into BinaryOperator and CastInst. BinaryOperator *BO = cast<BinaryOperator>(U); // OpNo = which operand of BO is UserChain[ChainIndex - 1] unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 
0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + Value *TheOther = applyCasts(BO->getOperand(1 - OpNo)); + Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1); BinaryOperator *NewBO = nullptr; if (OpNo == 0) { @@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]); assert((BO->use_empty() || BO->hasOneUse()) && - "distributeExtsAndCloneChain clones each BinaryOperator in " + "distributeCastsAndCloneChain clones each BinaryOperator in " "UserChain, so no one should be used more than " "once"); @@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) { // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" if (const TruncInst *TI = dyn_cast<TruncInst>(U)) return TI->hasNoUnsignedWrap(); - return isa<CastInst>(U) || isa<ConstantInt>(U); + assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User."); + return true; } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 9b40fc03da6b..e4ba70d1bce1 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier( static cl::opt<int> UnswitchSiblingsToplevelDiv( "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, cl::desc("Toplevel siblings divisor for cost multiplier.")); +static cl::opt<int> UnswitchParentBlocksDiv( + "unswitch-parent-blocks-div", cl::init(8), cl::Hidden, + cl::desc("Outer loop size divisor for cost multiplier.")); static cl::opt<int> UnswitchNumInitialUnscaledCandidates( "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, cl::desc("Number of unswitch candidates that are ignored 
when calculating " @@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch -/// candidates available. Also accounting for the number of "sibling" loops with -/// the idea to account for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch +/// candidates available. Also consider the number of "sibling" loops with +/// the idea of accounting for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier( return 1; } + // Each invariant non-trivial condition, after being unswitched, is supposed + // to have its own specialized sibling loop (the invariant condition has been + // hoisted out of the child loop into a newly-cloned loop). When unswitching + // conditions in nested loops, the basic block size of the outer loop should + // not be altered. If such a size significantly increases across unswitching + // invocations, something may be wrong; so adjust the final cost taking this + // into account. auto *ParentL = L.getParentLoop(); + int ParentLoopSizeMultiplier = 1; + if (ParentL) + ParentLoopSizeMultiplier = + std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1); + int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during @@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier( // at an upper bound. 
int CostMultiplier; if (ClonesPower > Log2_32(UnswitchThreshold) || - SiblingsMultiplier > UnswitchThreshold) + SiblingsMultiplier > UnswitchThreshold || + ParentLoopSizeMultiplier > UnswitchThreshold) CostMultiplier = UnswitchThreshold; else CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), (int)UnswitchThreshold); LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier - << " (siblings " << SiblingsMultiplier << " * clones " + << " (siblings " << SiblingsMultiplier << " * parent size " + << ParentLoopSizeMultiplier << " * clones " << (1 << ClonesPower) << ")" << " for unswitch candidate: " << TI << "\n"); return CostMultiplier; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index bb7dbc2980f5..e05625344ee2 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() { continue; OtherPhi->setIncomingValue(PoisonValBBIdx, V); - Phi->setIncomingValue(i, OtherV); + if (DT->dominates(OtherV, Phi)) + Phi->setIncomingValue(i, OtherV); } } } diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index c76b3afef50c..27b13eeaf4d7 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -1285,7 +1285,7 @@ private: // Cache misses on the merged chain double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; double MergedSize = ChainPred->Size + ChainSucc->Size; - double MergedDensity = static_cast<double>(MergedCounts) / MergedSize; + double MergedDensity = MergedCounts / MergedSize; double NewScore = MergedCounts * missProbability(MergedDensity); return CurScore - NewScore; diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 7063cde5263b..5a09b7385f2b 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ 
b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -254,7 +254,6 @@ bool llvm::applyDebugifyMetadata( } if (ApplyToMF) ApplyToMF(DIB, F); - DIB.finalizeSubprogram(SP); } DIB.finalize(); diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 3bbe875bbe9e..1a9e16be6989 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -13,6 +13,8 @@ #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/TimeProfiler.h" + using namespace llvm; /// Uses the "source_filename" instead of a Module hash ID for the suffix of @@ -370,6 +372,7 @@ void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); } void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index, bool ClearDSOLocalOnDeclarations, SetVector<GlobalValue *> *GlobalsToImport) { + llvm::TimeTraceScope timeScope("Rename module for ThinLTO"); FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport, ClearDSOLocalOnDeclarations); ThinLTOProcessing.run(); diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp index ad91318ae474..fefa49f68c8d 100644 --- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp +++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp @@ -427,7 +427,7 @@ void IRNormalizer::reorderInstructions(Function &F) const { // Process the remaining instructions. // // TODO: Do more a intelligent sorting of these instructions. For example, - // seperate between dead instructinos and instructions used in another + // separate between dead instructinos and instructions used in another // block. Use properties of the CFG the order instructions that are used // in another block. 
if (Visited.contains(&I)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ac344904f90f..2cfd70a1746c 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3397,8 +3397,8 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) { const APFloat &APF = FP->getValueAPF(); APInt const &API = APF.bitcastToAPInt(); - if (auto Temp = API.getZExtValue()) - return DIB.createConstantValueExpression(static_cast<uint64_t>(Temp)); + if (uint64_t Temp = API.getZExtValue()) + return DIB.createConstantValueExpression(Temp); return DIB.createConstantValueExpression(*API.getRawData()); } @@ -3838,8 +3838,8 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin( bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { const auto *Op = I->getOperand(OpIdx); - // We can't have a PHI with a metadata type. - if (Op->getType()->isMetadataTy()) + // We can't have a PHI with a metadata or token type. + if (Op->getType()->isMetadataTy() || Op->getType()->isTokenLikeTy()) return false; // swifterror pointers can only be used by a load, store, or as a swifterror diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index ba0ac01cadd8..735bad1cb134 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -225,9 +225,9 @@ protected: // Auxiliary function to calculate the number of iterations for a comparison // instruction or a binary operator. - PeelCounter mergeTwoCounter(const Instruction &CmpOrBinaryOp, - const PeelCounterValue &LHS, - const PeelCounterValue &RHS) const; + PeelCounter mergeTwoCounters(const Instruction &CmpOrBinaryOp, + const PeelCounterValue &LHS, + const PeelCounterValue &RHS) const; // Returns true if the \p Phi is an induction in the target loop. 
This is a // lightweight check and possible to detect an IV in some cases. @@ -269,15 +269,13 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const { break; // Avoid infinite loop. - if (Visited.contains(Cur)) + if (!Visited.insert(Cur).second) return false; auto *I = dyn_cast<Instruction>(Cur); if (!I || !L.contains(I)) return false; - Visited.insert(Cur); - if (auto *Cast = dyn_cast<CastInst>(I)) { Cur = Cast->getOperand(0); } else if (auto *BinOp = dyn_cast<BinaryOperator>(I)) { @@ -300,14 +298,14 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const { /// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is /// considered an IV only if it is an addition or a subtraction. Otherwise the -/// result can be a value that is neither an loop-invariant nor an IV. +/// result can be a value that is neither a loop-invariant nor an IV. /// /// If both \p LHS and \p RHS are loop-invariants, then the result of /// \CmpOrBinaryOp is also a loop-invariant. PhiAnalyzer::PeelCounter -PhiAnalyzer::mergeTwoCounter(const Instruction &CmpOrBinaryOp, - const PeelCounterValue &LHS, - const PeelCounterValue &RHS) const { +PhiAnalyzer::mergeTwoCounters(const Instruction &CmpOrBinaryOp, + const PeelCounterValue &LHS, + const PeelCounterValue &RHS) const { auto &[LVal, LTy] = LHS; auto &[RVal, RTy] = RHS; unsigned NewVal = std::max(LVal, RVal); @@ -380,7 +378,7 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) { if (RHS == Unknown) return Unknown; return (IterationsToInvarianceOrInduction[I] = - mergeTwoCounter(*I, *LHS, *RHS)); + mergeTwoCounters(*I, *LHS, *RHS)); } if (I->isCast()) // Cast instructions get the value of the operand. 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 86b268de43cf..b18aceaa67d7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, #endif ); +static cl::opt<bool> UnrollAddParallelReductions( + "unroll-add-parallel-reductions", cl::init(false), cl::Hidden, + cl::desc("Allow unrolling to add parallel reduction phis.")); /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. @@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, OrigPHINode.push_back(cast<PHINode>(I)); } + // Collect phi nodes for reductions for which we can introduce multiple + // parallel reduction phis and compute the final reduction result after the + // loop. This requires a single exit block after unrolling. This is ensured by + // restricting to single-block loops where the unrolled iterations are known + // to not exit. + DenseMap<PHINode *, RecurrenceDescriptor> Reductions; + bool CanAddAdditionalAccumulators = + (UnrollAddParallelReductions.getNumOccurrences() > 0 + ? UnrollAddParallelReductions + : ULO.AddAdditionalAccumulators) && + !CompletelyUnroll && L->getNumBlocks() == 1 && + (ULO.Runtime || + (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 && + ExitInfos[Header].BreakoutTrip == 0)))); + + // Limit parallelizing reductions to unroll counts of 4 or less for now. + // TODO: The number of parallel reductions should depend on the number of + // execution units. 
We also don't have to add a parallel reduction phi per + // unrolled iteration, but could for example add a parallel phi for every 2 + // unrolled iterations. + if (CanAddAdditionalAccumulators && ULO.Count <= 4) { + for (PHINode &Phi : Header->phis()) { + auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE); + if (!RdxDesc) + continue; + + // Only handle duplicate phis for a single reduction for now. + // TODO: Handle any number of reductions + if (!Reductions.empty()) + continue; + + Reductions[&Phi] = *RdxDesc; + } + } + std::vector<BasicBlock *> Headers; std::vector<BasicBlock *> Latches; Headers.push_back(Header); @@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + SmallVector<Instruction *> PartialReductions; for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector<BasicBlock *, 8> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; @@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + + // Use cloned phis as parallel phis for partial reductions, which will + // get combined to the final reduction result after the loop. + if (Reductions.contains(OrigPHI)) { + // Collect partial reduction results. + if (PartialReductions.empty()) + PartialReductions.push_back(cast<Instruction>(InVal)); + PartialReductions.push_back(cast<Instruction>(VMap[InVal])); + + // Update the start value for the cloned phis to use the identity + // value for the reduction. 
+ const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI]; + NewPHI->setIncomingValueForBlock( + L->getLoopPreheader(), + getRecurrenceIdentity(RdxDesc.getRecurrenceKind(), + OrigPHI->getType(), + RdxDesc.getFastMathFlags())); + + // Update NewPHI to use the cloned value for the iteration and move + // to header. + NewPHI->replaceUsesOfWith(InVal, VMap[InVal]); + NewPHI->moveBefore(OrigPHI->getIterator()); + continue; + } + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; @@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); PN->eraseFromParent(); } else if (ULO.Count > 1) { + if (Reductions.contains(PN)) + continue; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. @@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // If there are partial reductions, create code in the exit block to compute + // the final result and update users of the final result. 
+ if (!PartialReductions.empty()) { + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && + "Can only introduce parallel reduction phis with single exit block"); + assert(Reductions.size() == 1 && + "currently only a single reduction is supported"); + Value *FinalRdxValue = PartialReductions.back(); + Value *RdxResult = nullptr; + for (PHINode &Phi : ExitBlock->phis()) { + if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue) + continue; + if (!RdxResult) { + RdxResult = PartialReductions.front(); + IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt()); + RecurKind RK = Reductions.begin()->second.getRecurrenceKind(); + for (Instruction *RdxPart : drop_begin(PartialReductions)) { + RdxResult = Builder.CreateBinOp( + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), + RdxPart, RdxResult, "bin.rdx"); + } + NeedToFixLCSSA = true; + for (Instruction *RdxPart : PartialReductions) + RdxPart->dropPoisonGeneratingFlags(); + } + + Phi.replaceAllUsesWith(RdxResult); + continue; + } + } + if (DTUToUse) { // Apply updates to the DomTree. DT = &DTU.getDomTree(); @@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { } return nullptr; } + +std::optional<RecurrenceDescriptor> +llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE) { + RecurrenceDescriptor RdxDesc; + if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc, + /*DemandedBits=*/nullptr, + /*AC=*/nullptr, /*DT=*/nullptr, SE)) + return std::nullopt; + RecurKind RK = RdxDesc.getRecurrenceKind(); + // Skip unsupported reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. 
+ if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) || + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + return std::nullopt; + + if (RdxDesc.IntermediateStore) + return std::nullopt; + + // Don't unroll reductions with constant ops; those can be folded to a + // single induction update. + if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch())) + ->operands(), + IsaPred<Constant>)) + return std::nullopt; + + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || + !is_contained( + cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(), + &Phi)) + return std::nullopt; + + return RdxDesc; +} diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 843364eb34f8..b172ef6ba080 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -2032,6 +2032,7 @@ Value *llvm::addRuntimeChecks( MemoryRuntimeCheck = IsConflict; } + Exp.eraseDeadInstructions(MemoryRuntimeCheck); return MemoryRuntimeCheck; } @@ -2077,6 +2078,7 @@ Value *llvm::addDiffRuntimeChecks( MemoryRuntimeCheck = IsConflict; } + Expander.eraseDeadInstructions(MemoryRuntimeCheck); return MemoryRuntimeCheck; } diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 1711163fb9f5..ec2e6c1ab796 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -81,6 +81,8 @@ void LoopVersioning::versionLoop( } else RuntimeCheck = MemRuntimeCheck ? 
MemRuntimeCheck : SCEVRuntimeCheck; + Exp.eraseDeadInstructions(SCEVRuntimeCheck); + assert(RuntimeCheck && "called even though we don't need " "any runtime checks"); diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index 41647f7717a4..faacd422c009 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -155,12 +155,15 @@ PreservedAnalyses ProfileVerifierPass::run(Function &F, FunctionAnalysisManager &FAM) { const auto EntryCount = F.getEntryCount(/*AllowSynthetic=*/true); if (!EntryCount) { - F.getContext().emitError("Profile verification failed: function entry " - "count missing (set to 0 if cold)"); + auto *MD = F.getMetadata(LLVMContext::MD_prof); + if (!MD || !isExplicitlyUnknownProfileMetadata(*MD)) { + F.getContext().emitError("Profile verification failed: function entry " + "count missing (set to 0 if cold)"); + return PreservedAnalyses::all(); + } + } else if (EntryCount->getCount() == 0) { return PreservedAnalyses::all(); } - if (EntryCount->getCount() == 0) - return PreservedAnalyses::all(); for (const auto &BB : F) { if (AnnotateSelect) { for (const auto &I : BB) diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 10c162bc6463..d93a4d87f30f 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -849,9 +849,12 @@ void PromoteMem2Reg::run() { for (unsigned i = 0, e = Allocas.size(); i != e; ++i) IncomingVals.init(i, UndefValue::get(Allocas[i]->getAllocatedType())); - // When handling debug info, treat all incoming values as if they have unknown - // locations until proven otherwise. + // When handling debug info, treat all incoming values as if they have + // compiler-generated (empty) locations, representing the uninitialized + // alloca, until proven otherwise. 
IncomingLocs.resize(Allocas.size()); + for (unsigned i = 0, e = Allocas.size(); i != e; ++i) + IncomingLocs.init(i, DebugLoc::getCompilerGenerated()); // The renamer uses the Visited set to avoid infinite loops. Visited.resize(F.getMaxBlockNumber(), false); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index d53a3144bf57..a814867652cd 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -21,29 +21,20 @@ using namespace llvm; -static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { +struct LookupTableInfo { + Value *Index; + SmallVector<Constant *> Ptrs; +}; + +static bool shouldConvertToRelLookupTable(LookupTableInfo &Info, Module &M, + GlobalVariable &GV) { // If lookup table has more than one user, // do not generate a relative lookup table. // This is to simplify the analysis that needs to be done for this pass. // TODO: Add support for lookup tables with multiple uses. // For ex, this can happen when a function that uses a lookup table gets // inlined into multiple call sites. - if (!GV.hasInitializer() || - !GV.isConstant() || - !GV.hasOneUse()) - return false; - - GetElementPtrInst *GEP = - dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser()); - if (!GEP || !GEP->hasOneUse() || - GV.getValueType() != GEP->getSourceElementType()) - return false; - - LoadInst *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser()); - if (!Load || !Load->hasOneUse() || - Load->getType() != GEP->getResultElementType()) - return false; - + // // If the original lookup table does not have local linkage and is // not dso_local, do not generate a relative lookup table. 
// This optimization creates a relative lookup table that consists of @@ -51,21 +42,40 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { // To be able to generate these offsets, relative lookup table and // its elements should have internal linkage and be dso_local, which means // that they should resolve to symbols within the same linkage unit. - if (!GV.hasLocalLinkage() || - !GV.isDSOLocal() || - !GV.isImplicitDSOLocal()) + if (!GV.hasInitializer() || !GV.isConstant() || !GV.hasOneUse() || + !GV.hasLocalLinkage() || !GV.isDSOLocal() || !GV.isImplicitDSOLocal()) return false; - ConstantArray *Array = dyn_cast<ConstantArray>(GV.getInitializer()); - if (!Array) + auto *GEP = dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser()); + if (!GEP || !GEP->hasOneUse()) + return false; + + auto *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser()); + if (!Load || !Load->hasOneUse()) return false; // If values are not 64-bit pointers, do not generate a relative lookup table. const DataLayout &DL = M.getDataLayout(); - Type *ElemType = Array->getType()->getElementType(); + Type *ElemType = Load->getType(); if (!ElemType->isPointerTy() || DL.getPointerTypeSizeInBits(ElemType) != 64) return false; + // Make sure this is a gep of the form GV + scale*var. + unsigned IndexWidth = + DL.getIndexTypeSizeInBits(Load->getPointerOperand()->getType()); + SmallMapVector<Value *, APInt, 4> VarOffsets; + APInt ConstOffset(IndexWidth, 0); + if (!GEP->collectOffset(DL, IndexWidth, VarOffsets, ConstOffset) || + !ConstOffset.isZero() || VarOffsets.size() != 1) + return false; + + // This can't be a pointer lookup table if the stride is smaller than a + // pointer. + Info.Index = VarOffsets.front().first; + const APInt &Stride = VarOffsets.front().second; + if (Stride.ult(DL.getTypeStoreSize(ElemType))) + return false; + SmallVector<GlobalVariable *, 4> GVOps; Triple TT = M.getTargetTriple(); // FIXME: This should be removed in the future. 
@@ -80,14 +90,20 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { // https://github.com/rust-lang/rust/issues/141306. || (TT.isX86() && TT.isOSDarwin()); - for (const Use &Op : Array->operands()) { - Constant *ConstOp = cast<Constant>(&Op); + APInt Offset(IndexWidth, 0); + uint64_t GVSize = DL.getTypeAllocSize(GV.getValueType()); + for (; Offset.ult(GVSize); Offset += Stride) { + Constant *C = + ConstantFoldLoadFromConst(GV.getInitializer(), ElemType, Offset, DL); + if (!C) + return false; + GlobalValue *GVOp; - APInt Offset; + APInt GVOffset; // If an operand is not a constant offset from a lookup table, // do not generate a relative lookup table. - if (!IsConstantOffsetFromGlobal(ConstOp, GVOp, Offset, DL)) + if (!IsConstantOffsetFromGlobal(C, GVOp, GVOffset, DL)) return false; // If operand is mutable, do not generate a relative lookup table. @@ -102,6 +118,8 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { if (ShouldDropUnnamedAddr) GVOps.push_back(GlovalVarOp); + + Info.Ptrs.push_back(C); } if (ShouldDropUnnamedAddr) @@ -111,14 +129,12 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { return true; } -static GlobalVariable *createRelLookupTable(Function &Func, +static GlobalVariable *createRelLookupTable(LookupTableInfo &Info, + Function &Func, GlobalVariable &LookupTable) { Module &M = *Func.getParent(); - ConstantArray *LookupTableArr = - cast<ConstantArray>(LookupTable.getInitializer()); - unsigned NumElts = LookupTableArr->getType()->getNumElements(); ArrayType *IntArrayTy = - ArrayType::get(Type::getInt32Ty(M.getContext()), NumElts); + ArrayType::get(Type::getInt32Ty(M.getContext()), Info.Ptrs.size()); GlobalVariable *RelLookupTable = new GlobalVariable( M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(), @@ -127,10 +143,9 @@ static GlobalVariable *createRelLookupTable(Function &Func, LookupTable.isExternallyInitialized()); uint64_t Idx = 0; - 
SmallVector<Constant *, 64> RelLookupTableContents(NumElts); + SmallVector<Constant *, 64> RelLookupTableContents(Info.Ptrs.size()); - for (Use &Operand : LookupTableArr->operands()) { - Constant *Element = cast<Constant>(Operand); + for (Constant *Element : Info.Ptrs) { Type *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); Constant *Base = llvm::ConstantExpr::getPtrToInt(RelLookupTable, IntPtrTy); Constant *Target = llvm::ConstantExpr::getPtrToInt(Element, IntPtrTy); @@ -148,7 +163,8 @@ static GlobalVariable *createRelLookupTable(Function &Func, return RelLookupTable; } -static void convertToRelLookupTable(GlobalVariable &LookupTable) { +static void convertToRelLookupTable(LookupTableInfo &Info, + GlobalVariable &LookupTable) { GetElementPtrInst *GEP = cast<GetElementPtrInst>(LookupTable.use_begin()->getUser()); LoadInst *Load = cast<LoadInst>(GEP->use_begin()->getUser()); @@ -159,21 +175,21 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Function &Func = *BB->getParent(); // Generate an array that consists of relative offsets. - GlobalVariable *RelLookupTable = createRelLookupTable(Func, LookupTable); + GlobalVariable *RelLookupTable = + createRelLookupTable(Info, Func, LookupTable); // Place new instruction sequence before GEP. Builder.SetInsertPoint(GEP); - Value *Index = GEP->getOperand(2); - IntegerType *IntTy = cast<IntegerType>(Index->getType()); - Value *Offset = - Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); + IntegerType *IntTy = cast<IntegerType>(Info.Index->getType()); + Value *Offset = Builder.CreateShl(Info.Index, ConstantInt::get(IntTy, 2), + "reltable.shift"); // Insert the call to load.relative intrinsic before LOAD. // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. 
Builder.SetInsertPoint(Load); Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::load_relative, {Index->getType()}); + &M, Intrinsic::load_relative, {Info.Index->getType()}); // Create a call to load.relative intrinsic that computes the target address // by adding base address (lookup table address) and relative offset. @@ -205,10 +221,11 @@ static bool convertToRelativeLookupTables( bool Changed = false; for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { - if (!shouldConvertToRelLookupTable(M, GV)) + LookupTableInfo Info; + if (!shouldConvertToRelLookupTable(Info, M, GV)) continue; - convertToRelLookupTable(GV); + convertToRelLookupTable(Info, GV); // Remove the original lookup table. GV.eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 060ca92e559a..28befd0aa1ce 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #if LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -175,6 +176,26 @@ SCEVExpander::findInsertPointAfter(Instruction *I, return IP; } +void SCEVExpander::eraseDeadInstructions(Value *Root) { + SmallVector<Value *> WorkList; + SmallPtrSet<Value *, 8> DeletedValues; + append_range(WorkList, getAllInsertedInstructions()); + while (!WorkList.empty()) { + Value *V = WorkList.pop_back_val(); + if (DeletedValues.contains(V)) + continue; + auto *I = dyn_cast<Instruction>(V); + if (!I || I == Root || !isInsertedInstruction(I) || + !isInstructionTriviallyDead(I)) + continue; + append_range(WorkList, I->operands()); + InsertedValues.erase(I); + InsertedPostIncValues.erase(I); + DeletedValues.insert(I); + I->eraseFromParent(); + } +} + 
BasicBlock::iterator SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const { // Cast the argument at the beginning of the entry block, after @@ -1239,10 +1260,13 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { if (!isa<SCEVAddRecExpr>(ExitSCEV)) continue; Type *PhiTy = PN.getType(); - if (STy->isIntegerTy() && PhiTy->isPointerTy()) + if (STy->isIntegerTy() && PhiTy->isPointerTy()) { ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy); - else if (S->getType() != PN.getType()) + if (isa<SCEVCouldNotCompute>(ExitSCEV)) + continue; + } else if (S->getType() != PN.getType()) { continue; + } // Check if we can re-use the existing PN, by adjusting it with an expanded // offset, if the offset is simpler. @@ -2184,8 +2208,15 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, // negative. If Step is known to be positive or negative, only create // either 1. or 2. auto ComputeEndCheck = [&]() -> Value * { - // Checking <u 0 is always false. - if (!Signed && Start->isZero() && SE.isKnownPositive(Step)) + // Checking <u 0 is always false, if (Step * trunc ExitCount) does not wrap. + // TODO: Predicates that can be proven true/false should be discarded when + // the predicates are created, not late during expansion. + if (!Signed && Start->isZero() && SE.isKnownPositive(Step) && + DstBits < SrcBits && + ExitCount == SE.getZeroExtendExpr(SE.getTruncateExpr(ExitCount, ARTy), + ExitCount->getType()) && + SE.willNotOverflow(Instruction::Mul, Signed, Step, + SE.getTruncateExpr(ExitCount, ARTy))) return ConstantInt::getFalse(Loc->getContext()); // Get the backedge taken count and truncate or extended to the AR type. 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 7a538ae2c583..970f85378d3d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -612,6 +612,18 @@ private: /// If CompValue is already set, the function is expected to fail if a match /// is found but the value compared to is different. bool matchInstruction(Instruction *I, bool isEQ) { + if (match(I, m_Not(m_Instruction(I)))) + isEQ = !isEQ; + + Value *Val; + if (match(I, m_NUWTrunc(m_Value(Val)))) { + // If we already have a value for the switch, it has to match! + if (!setValueOnce(Val)) + return false; + UsedICmps++; + Vals.push_back(ConstantInt::get(cast<IntegerType>(Val->getType()), isEQ)); + return true; + } // If this is an icmp against a constant, handle this as one of the cases. ICmpInst *ICI; ConstantInt *C; @@ -2260,10 +2272,6 @@ static bool canSinkInstructions( for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) { Value *Op = I0->getOperand(OI); - if (Op->getType()->isTokenTy()) - // Don't touch any operand of token type. - return false; - auto SameAsI0 = [&I0, OI](const Instruction *I) { assert(I->getNumOperands() == I0->getNumOperands()); return I->getOperand(OI) == I0->getOperand(OI); @@ -2764,8 +2772,7 @@ bool CompatibleSets::shouldBelongToSameSet(ArrayRef<InvokeInst *> Invokes) { Use &U1 = std::get<1>(Ops); if (U0 == U1) return false; - return U0->getType()->isTokenTy() || - !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()), + return !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()), U0.getOperandNo()); }; assert(Invokes.size() == 2 && "Always called with exactly two candidates."); @@ -4404,10 +4411,12 @@ static bool mergeConditionalStoreToAddress( // OK, we're going to sink the stores to PostBB. The store has to be // conditional though, so first create the predicate. 
- Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); - Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); + BranchInst *PBranch = + cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()); + BranchInst *QBranch = + cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()); + Value *PCond = PBranch->getCondition(); + Value *QCond = QBranch->getCondition(); Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), PStore->getParent()); @@ -4418,13 +4427,11 @@ static bool mergeConditionalStoreToAddress( IRBuilder<> QB(PostBB, PostBBFirst); QB.SetCurrentDebugLocation(PostBBFirst->getStableDebugLoc()); - Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); - Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + InvertPCond ^= (PStore->getParent() != PTB); + InvertQCond ^= (QStore->getParent() != QTB); + Value *PPred = InvertPCond ? QB.CreateNot(PCond) : PCond; + Value *QPred = InvertQCond ? QB.CreateNot(QCond) : QCond; - if (InvertPCond) - PPred = QB.CreateNot(PPred); - if (InvertQCond) - QPred = QB.CreateNot(QPred); Value *CombinedPred = QB.CreateOr(PPred, QPred); BasicBlock::iterator InsertPt = QB.GetInsertPoint(); @@ -4808,23 +4815,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, SelectInst *NV = cast<SelectInst>( Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux")); PN.setIncomingValue(PBBIdx, NV); - // Although the select has the same condition as PBI, the original branch - // weights for PBI do not apply to the new select because the select's - // 'logical' edges are incoming edges of the phi that is eliminated, not - // the outgoing edges of PBI. + // The select has the same condition as PBI, in the same BB. The + // probabilities don't change. if (HasWeights) { - uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; - uint64_t PredOther = PBIOp ? 
PredTrueWeight : PredFalseWeight; - uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; - uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; - // The weight to PredCommonDest should be PredCommon * SuccTotal. - // The weight to PredOtherDest should be PredOther * SuccCommon. - uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther), - PredOther * SuccCommon}; - - fitWeights(NewWeights); - - setBranchWeights(NV, NewWeights[0], NewWeights[1], + uint64_t TrueWeight = PBIOp ? PredFalseWeight : PredTrueWeight; + uint64_t FalseWeight = PBIOp ? PredTrueWeight : PredFalseWeight; + setBranchWeights(NV, TrueWeight, FalseWeight, /*IsExpected=*/false); } } @@ -6437,34 +6433,42 @@ static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, namespace { -/// This class represents a lookup table that can be used to replace a switch. -class SwitchLookupTable { +/// This class finds alternatives for switches to ultimately +/// replace the switch. +class SwitchReplacement { public: - /// Create a lookup table to use as a switch replacement with the contents - /// of Values, using DefaultValue to fill any holes in the table. - SwitchLookupTable( + /// Create a helper for optimizations to use as a switch replacement. + /// Find a better representation for the content of Values, + /// using DefaultValue to fill any holes in the table. + SwitchReplacement( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName); - /// Build instructions with Builder to retrieve the value at - /// the position given by Index in the lookup table. - Value *buildLookup(Value *Index, IRBuilder<> &Builder, const DataLayout &DL); + /// Build instructions with Builder to retrieve values using Index + /// and replace the switch. 
+ Value *replaceSwitch(Value *Index, IRBuilder<> &Builder, const DataLayout &DL, + Function *Func); /// Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. static bool wouldFitInRegister(const DataLayout &DL, uint64_t TableSize, Type *ElementType); + /// Return the default value of the switch. + Constant *getDefaultValue(); + + /// Return true if the replacement is a lookup table. + bool isLookupTable(); + private: - // Depending on the contents of the table, it can be represented in - // different ways. + // Depending on the switch, there are different alternatives. enum { - // For tables where each element contains the same value, we just have to + // For switches where each case contains the same value, we just have to // store that single value and return it for each lookup. SingleValueKind, - // For tables where there is a linear relationship between table index + // For switches where there is a linear relationship between table index // and values. We calculate the result with a simple multiplication // and addition instead of a table lookup. LinearMapKind, @@ -6476,9 +6480,15 @@ private: // The table is stored as an array of values. Values are retrieved by load // instructions from the table. - ArrayKind + LookupTableKind } Kind; + // The default value of the switch. + Constant *DefaultValue; + + // The type of the output values. + Type *ValueType; + // For SingleValueKind, this is the single value. Constant *SingleValue = nullptr; @@ -6491,23 +6501,24 @@ private: ConstantInt *LinearMultiplier = nullptr; bool LinearMapValWrapped = false; - // For ArrayKind, this is the array. - GlobalVariable *Array = nullptr; + // For LookupTableKind, this is the table. 
+ Constant *Initializer = nullptr; }; } // end anonymous namespace -SwitchLookupTable::SwitchLookupTable( +SwitchReplacement::SwitchReplacement( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) { + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) + : DefaultValue(DefaultValue) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); // If all values in the table are equal, this is that value. SingleValue = Values.begin()->second; - Type *ValueType = Values.begin()->second->getType(); + ValueType = Values.begin()->second->getType(); // Build up the table contents. SmallVector<Constant *, 64> TableContents(TableSize); @@ -6597,7 +6608,6 @@ SwitchLookupTable::SwitchLookupTable( (void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap); LinearMapValWrapped = NonMonotonic || MayWrap; Kind = LinearMapKind; - ++NumLinearMaps; return; } } @@ -6617,30 +6627,23 @@ SwitchLookupTable::SwitchLookupTable( BitMap = ConstantInt::get(M.getContext(), TableInt); BitMapElementTy = IT; Kind = BitMapKind; - ++NumBitMaps; return; } // Store the table in an array. - ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize); - Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); - - Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true, - GlobalVariable::PrivateLinkage, Initializer, - "switch.table." + FuncName); - Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - // Set the alignment to that of an array items. We will be only loading one - // value out of it. 
- Array->setAlignment(DL.getPrefTypeAlign(ValueType)); - Kind = ArrayKind; + auto *TableTy = ArrayType::get(ValueType, TableSize); + Initializer = ConstantArray::get(TableTy, TableContents); + + Kind = LookupTableKind; } -Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, - const DataLayout &DL) { +Value *SwitchReplacement::replaceSwitch(Value *Index, IRBuilder<> &Builder, + const DataLayout &DL, Function *Func) { switch (Kind) { case SingleValueKind: return SingleValue; case LinearMapKind: { + ++NumLinearMaps; // Derive the result value from the input value. Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(), false, "switch.idx.cast"); @@ -6656,6 +6659,7 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, return Result; } case BitMapKind: { + ++NumBitMaps; // Type of the bitmap (e.g. i59). IntegerType *MapTy = BitMap->getIntegerType(); @@ -6677,9 +6681,18 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, // Mask off. return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked"); } - case ArrayKind: { - Type *IndexTy = DL.getIndexType(Array->getType()); - auto *ArrayTy = cast<ArrayType>(Array->getValueType()); + case LookupTableKind: { + ++NumLookupTables; + auto *Table = + new GlobalVariable(*Func->getParent(), Initializer->getType(), + /*isConstant=*/true, GlobalVariable::PrivateLinkage, + Initializer, "switch.table." + Func->getName()); + Table->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + // Set the alignment to that of an array items. We will be only loading one + // value out of it. 
+ Table->setAlignment(DL.getPrefTypeAlign(ValueType)); + Type *IndexTy = DL.getIndexType(Table->getType()); + auto *ArrayTy = cast<ArrayType>(Table->getValueType()); if (Index->getType() != IndexTy) { unsigned OldBitWidth = Index->getType()->getIntegerBitWidth(); @@ -6691,14 +6704,14 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, Value *GEPIndices[] = {ConstantInt::get(IndexTy, 0), Index}; Value *GEP = - Builder.CreateInBoundsGEP(ArrayTy, Array, GEPIndices, "switch.gep"); + Builder.CreateInBoundsGEP(ArrayTy, Table, GEPIndices, "switch.gep"); return Builder.CreateLoad(ArrayTy->getElementType(), GEP, "switch.load"); } } - llvm_unreachable("Unknown lookup table kind!"); + llvm_unreachable("Unknown helper kind!"); } -bool SwitchLookupTable::wouldFitInRegister(const DataLayout &DL, +bool SwitchReplacement::wouldFitInRegister(const DataLayout &DL, uint64_t TableSize, Type *ElementType) { auto *IT = dyn_cast<IntegerType>(ElementType); @@ -6734,6 +6747,10 @@ static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI, DL.fitsInLegalInteger(IT->getBitWidth()); } +Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } + +bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -6760,25 +6777,23 @@ static bool isSwitchDense(ArrayRef<int64_t> Values) { // TODO: We could support larger than legal types by limiting based on the // number of loads required and/or table size. If the constants are small we // could use smaller table entries and extend after the load. 
-static bool -shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, - const TargetTransformInfo &TTI, const DataLayout &DL, - const SmallDenseMap<PHINode *, Type *> &ResultTypes) { +static bool shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, + const TargetTransformInfo &TTI, + const DataLayout &DL, + const SmallVector<Type *> &ResultTypes) { if (SI->getNumCases() > TableSize) return false; // TableSize overflowed. bool AllTablesFitInRegister = true; bool HasIllegalType = false; - for (const auto &I : ResultTypes) { - Type *Ty = I.second; - + for (const auto &Ty : ResultTypes) { // Saturate this flag to true. HasIllegalType = HasIllegalType || !isTypeLegalForLookupTable(Ty, TTI, DL); // Saturate this flag to false. AllTablesFitInRegister = AllTablesFitInRegister && - SwitchLookupTable::wouldFitInRegister(DL, TableSize, Ty); + SwitchReplacement::wouldFitInRegister(DL, TableSize, Ty); // If both flags saturate, we're done. NOTE: This *only* works with // saturating flags, and all flags have to saturate first due to the @@ -6800,7 +6815,7 @@ shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, static bool shouldUseSwitchConditionAsTableIndex( ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal, - bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes, + bool HasDefaultResults, const SmallVector<Type *> &ResultTypes, const DataLayout &DL, const TargetTransformInfo &TTI) { if (MinCaseVal.isNullValue()) return true; @@ -6808,10 +6823,9 @@ static bool shouldUseSwitchConditionAsTableIndex( MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() || !HasDefaultResults) return false; - return all_of(ResultTypes, [&](const auto &KV) { - return SwitchLookupTable::wouldFitInRegister( - DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, - KV.second /* ResultType */); + return all_of(ResultTypes, [&](const auto &ResultType) { + return SwitchReplacement::wouldFitInRegister( + DL, MaxCaseVal.getLimitedValue() + 1 /* 
TableSize */, ResultType); }); } @@ -6900,18 +6914,13 @@ static void reuseTableCompare( /// If the switch is only used to initialize one or more phi nodes in a common /// successor block with different constant values, replace the switch with /// lookup tables. -static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, - DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { +static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, + DomTreeUpdater *DTU, const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); Function *Fn = BB->getParent(); - // Only build lookup table when we have a target that supports it or the - // attribute is not set. - if (!TTI.shouldBuildLookupTables() || - (Fn->getFnAttribute("no-jump-tables").getValueAsBool())) - return false; // FIXME: If the switch is too sparse for a lookup table, perhaps we could // split off a dense part and build a lookup table for that. @@ -6938,7 +6947,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, SmallDenseMap<PHINode *, ResultListTy> ResultLists; SmallDenseMap<PHINode *, Constant *> DefaultResults; - SmallDenseMap<PHINode *, Type *> ResultTypes; + SmallVector<Type *> ResultTypes; SmallVector<PHINode *, 4> PHIs; for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) { @@ -6955,7 +6964,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, Results, DL, TTI)) return false; - // Append the result from this case to the list for each phi. + // Append the result and result types from this case to the list for each + // phi. 
for (const auto &I : Results) { PHINode *PHI = I.first; Constant *Value = I.second; @@ -6963,23 +6973,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (Inserted) PHIs.push_back(PHI); It->second.push_back(std::make_pair(CaseVal, Value)); + ResultTypes.push_back(PHI->getType()); } } - // Keep track of the result types. - for (PHINode *PHI : PHIs) { - ResultTypes[PHI] = ResultLists[PHI][0].second->getType(); - } - - uint64_t NumResults = ResultLists[PHIs[0]].size(); - // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList; bool HasDefaultResults = getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL, TTI); - for (const auto &I : DefaultResultsList) { PHINode *PHI = I.first; Constant *Result = I.second; @@ -6989,15 +6992,21 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, bool UseSwitchConditionAsTableIndex = shouldUseSwitchConditionAsTableIndex( *MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI); uint64_t TableSize; - if (UseSwitchConditionAsTableIndex) + ConstantInt *TableIndexOffset; + if (UseSwitchConditionAsTableIndex) { TableSize = MaxCaseVal->getLimitedValue() + 1; - else + TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0); + } else { TableSize = (MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1; + TableIndexOffset = MinCaseVal; + } + // If the default destination is unreachable, or if the lookup table covers // all values of the conditional variable, branch directly to the lookup table // BB. Otherwise, check that the condition is within the case range. 
+ uint64_t NumResults = ResultLists[PHIs[0]].size(); bool DefaultIsReachable = !SI->defaultDestUnreachable(); bool TableHasHoles = (NumResults < TableSize); @@ -7025,68 +7034,100 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (!shouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; - std::vector<DominatorTree::UpdateType> Updates; - - // Compute the maximum table size representable by the integer type we are - // switching upon. - unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); - uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; - assert(MaxTableSize >= TableSize && - "It is impossible for a switch to have more entries than the max " - "representable value of its input integer type's size."); - - // Create the BB that does the lookups. - Module &Mod = *CommonDest->getParent()->getParent(); - BasicBlock *LookupBB = BasicBlock::Create( - Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); - // Compute the table index value. - Builder.SetInsertPoint(SI); Value *TableIndex; - ConstantInt *TableIndexOffset; if (UseSwitchConditionAsTableIndex) { - TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0); TableIndex = SI->getCondition(); - } else { - TableIndexOffset = MinCaseVal; + if (HasDefaultResults) { + // Grow the table to cover all possible index values to avoid the range + // check. It will use the default result to fill in the table hole later, + // so make sure it exist. + ConstantRange CR = + computeConstantRange(TableIndex, /* ForSigned */ false); + // Grow the table shouldn't have any size impact by checking + // wouldFitInRegister. + // TODO: Consider growing the table also when it doesn't fit in a register + // if no optsize is specified. 
+ const uint64_t UpperBound = CR.getUpper().getLimitedValue(); + if (!CR.isUpperWrapped() && + all_of(ResultTypes, [&](const auto &ResultType) { + return SwitchReplacement::wouldFitInRegister(DL, UpperBound, + ResultType); + })) { + // There may be some case index larger than the UpperBound (unreachable + // case), so make sure the table size does not get smaller. + TableSize = std::max(UpperBound, TableSize); + // The default branch is unreachable after we enlarge the lookup table. + // Adjust DefaultIsReachable to reuse code path. + DefaultIsReachable = false; + } + } + } + + // Keep track of the switch replacement for each phi + SmallDenseMap<PHINode *, SwitchReplacement> PhiToReplacementMap; + for (PHINode *PHI : PHIs) { + const auto &ResultList = ResultLists[PHI]; + + Type *ResultType = ResultList.begin()->second->getType(); + // Use any value to fill the lookup table holes. + Constant *DefaultVal = + AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI]; + StringRef FuncName = Fn->getName(); + SwitchReplacement Replacement(*Fn->getParent(), TableSize, TableIndexOffset, + ResultList, DefaultVal, DL, FuncName); + PhiToReplacementMap.insert({PHI, Replacement}); + } + + bool AnyLookupTables = any_of( + PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + + // A few conditions prevent the generation of lookup tables: + // 1. The target does not support lookup tables. + // 2. The "no-jump-tables" function attribute is set. + // However, these objections do not apply to other switch replacements, like + // the bitmap, so we only stop here if any of these conditions are met and we + // want to create a LUT. Otherwise, continue with the switch replacement. 
+ if (AnyLookupTables && + (!TTI.shouldBuildLookupTables() || + Fn->getFnAttribute("no-jump-tables").getValueAsBool())) + return false; + + Builder.SetInsertPoint(SI); + // TableIndex is the switch condition - TableIndexOffset if we don't + // use the condition directly + if (!UseSwitchConditionAsTableIndex) { // If the default is unreachable, all case values are s>= MinCaseVal. Then // we can try to attach nsw. bool MayWrap = true; if (!DefaultIsReachable) { - APInt Res = MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap); + APInt Res = + MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap); (void)Res; } - TableIndex = Builder.CreateSub(SI->getCondition(), TableIndexOffset, "switch.tableidx", /*HasNUW =*/false, /*HasNSW =*/!MayWrap); } - BranchInst *RangeCheckBranch = nullptr; + std::vector<DominatorTree::UpdateType> Updates; - // Grow the table to cover all possible index values to avoid the range check. - // It will use the default result to fill in the table hole later, so make - // sure it exist. - if (UseSwitchConditionAsTableIndex && HasDefaultResults) { - ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false); - // Grow the table shouldn't have any size impact by checking - // wouldFitInRegister. - // TODO: Consider growing the table also when it doesn't fit in a register - // if no optsize is specified. - const uint64_t UpperBound = CR.getUpper().getLimitedValue(); - if (!CR.isUpperWrapped() && all_of(ResultTypes, [&](const auto &KV) { - return SwitchLookupTable::wouldFitInRegister( - DL, UpperBound, KV.second /* ResultType */); - })) { - // There may be some case index larger than the UpperBound (unreachable - // case), so make sure the table size does not get smaller. - TableSize = std::max(UpperBound, TableSize); - // The default branch is unreachable after we enlarge the lookup table. - // Adjust DefaultIsReachable to reuse code path. 
- DefaultIsReachable = false; - } - } + // Compute the maximum table size representable by the integer type we are + // switching upon. + unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); + uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; + assert(MaxTableSize >= TableSize && + "It is impossible for a switch to have more entries than the max " + "representable value of its input integer type's size."); + + // Create the BB that does the lookups. + Module &Mod = *CommonDest->getParent()->getParent(); + BasicBlock *LookupBB = BasicBlock::Create( + Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); + + BranchInst *RangeCheckBranch = nullptr; + Builder.SetInsertPoint(SI); const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); if (!DefaultIsReachable || GeneratingCoveredLookupTable) { Builder.CreateBr(LookupBB); @@ -7157,25 +7198,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, for (PHINode *PHI : PHIs) { const ResultListTy &ResultList = ResultLists[PHI]; - - Type *ResultType = ResultList.begin()->second->getType(); - - // Use any value to fill the lookup table holes. - Constant *DV = - AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI]; - StringRef FuncName = Fn->getName(); - SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, - DL, FuncName); - - Value *Result = Table.buildLookup(TableIndex, Builder, DL); - + auto Replacement = PhiToReplacementMap.at(PHI); + auto *Result = Replacement.replaceSwitch(TableIndex, Builder, DL, Fn); // Do a small peephole optimization: re-use the switch table compare if // possible. if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) { BasicBlock *PhiBlock = PHI->getParent(); // Search for compare instructions which use the phi. 
for (auto *User : PHI->users()) { - reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList); + reuseTableCompare(User, PhiBlock, RangeCheckBranch, + Replacement.getDefaultValue(), ResultList); } } @@ -7202,7 +7234,6 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (DTU) DTU->applyUpdates(Updates); - ++NumLookupTables; if (NeedMask) ++NumLookupTablesHoles; return true; @@ -7708,7 +7739,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // CVP. Therefore, only apply this transformation during late stages of the // optimisation pipeline. if (Options.ConvertSwitchToLookupTable && - switchToLookupTable(SI, Builder, DTU, DL, TTI)) + simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 2d6a748f4507..8acebbaa5458 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -97,6 +97,10 @@ static cl::opt<unsigned, false, HotColdHintParser> static cl::opt<unsigned, false, HotColdHintParser> HotNewHintValue( "hot-new-hint-value", cl::Hidden, cl::init(254), cl::desc("Value to pass to hot/cold operator new for hot allocation")); +static cl::opt<unsigned, false, HotColdHintParser> AmbiguousNewHintValue( + "ambiguous-new-hint-value", cl::Hidden, cl::init(222), + cl::desc( + "Value to pass to hot/cold operator new for ambiguous allocation")); //===----------------------------------------------------------------------===// // Helper Functions @@ -1719,6 +1723,37 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { return nullptr; } +// Allow existing calls to operator new() that takes a __hot_cold_t parameter to +// be updated with a compiler-determined hot cold hint value. 
This is used in +// cases where the call is marked nobuiltin (because operator new called +// explicitly) and therefore cannot be replaced with a different callee. +Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI, + IRBuilderBase &B) { + if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew) + return nullptr; + Function *Callee = CI->getCalledFunction(); + if (!Callee) + return nullptr; + LibFunc Func; + if (!TLI->getLibFunc(*Callee, Func)) + return nullptr; + switch (Func) { + case LibFunc_Znwm12__hot_cold_t: + case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: + case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: + case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t: + case LibFunc_Znam12__hot_cold_t: + case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t: + case LibFunc_ZnamSt11align_val_t12__hot_cold_t: + case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: + case LibFunc_size_returning_new_hot_cold: + case LibFunc_size_returning_new_aligned_hot_cold: + return optimizeNew(CI, B, Func); + default: + return nullptr; + } +} + // When enabled, replace operator new() calls marked with a hot or cold memprof // attribute with an operator new() call that takes a __hot_cold_t parameter. 
// Currently this is supported by the open source version of tcmalloc, see: @@ -1736,6 +1771,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold = NotColdNewHintValue; else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "hot") HotCold = HotNewHintValue; + else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == + "ambiguous") + HotCold = AmbiguousNewHintValue; else return nullptr; @@ -1753,9 +1791,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_Znwm12__hot_cold_t, HotCold); break; case LibFunc_Znwm: - if (HotCold != NotColdNewHintValue) - return emitHotColdNew(CI->getArgOperand(0), B, TLI, - LibFunc_Znwm12__hot_cold_t, HotCold); + return emitHotColdNew(CI->getArgOperand(0), B, TLI, + LibFunc_Znwm12__hot_cold_t, HotCold); break; case LibFunc_Znam12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1763,9 +1800,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_Znam12__hot_cold_t, HotCold); break; case LibFunc_Znam: - if (HotCold != NotColdNewHintValue) - return emitHotColdNew(CI->getArgOperand(0), B, TLI, - LibFunc_Znam12__hot_cold_t, HotCold); + return emitHotColdNew(CI->getArgOperand(0), B, TLI, + LibFunc_Znam12__hot_cold_t, HotCold); break; case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1774,10 +1810,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnwmRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold); + return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1786,10 +1821,9 @@ Value 
*LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold); + return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1798,10 +1832,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold); break; case LibFunc_ZnwmSt11align_val_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold); + return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnwmSt11align_val_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnamSt11align_val_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1810,10 +1843,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamSt11align_val_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold); + return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnamSt11align_val_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1823,11 +1855,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAlignedNoThrow( - 
CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, - TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, - HotCold); + return emitHotColdNewAlignedNoThrow( + CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, + TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1837,17 +1867,14 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAlignedNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, - TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, - HotCold); + return emitHotColdNewAlignedNoThrow( + CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, + TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_size_returning_new: - if (HotCold != NotColdNewHintValue) - return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI, - LibFunc_size_returning_new_hot_cold, - HotCold); + return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI, + LibFunc_size_returning_new_hot_cold, + HotCold); break; case LibFunc_size_returning_new_hot_cold: if (OptimizeExistingHotColdNew) @@ -1856,10 +1883,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_size_returning_new_aligned: - if (HotCold != NotColdNewHintValue) - return emitHotColdSizeReturningNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_size_returning_new_aligned_hot_cold, HotCold); + return emitHotColdSizeReturningNewAligned( + CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, + LibFunc_size_returning_new_aligned_hot_cold, HotCold); break; case LibFunc_size_returning_new_aligned_hot_cold: if (OptimizeExistingHotColdNew) @@ -4094,8 
+4120,11 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. - if (CI->isNoBuiltin()) - return nullptr; + if (CI->isNoBuiltin()) { + // If this is an existing call to a hot cold operator new, we can update the + // hint parameter value, which doesn't change the callee. + return optimizeExistingHotColdNew(CI, Builder); + } LibFunc Func; Function *Callee = CI->getCalledFunction(); diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index d52d52a9b7d3..6319fd524ff0 100644 --- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -349,13 +349,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -379,12 +373,22 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, // TODO see if there is a more elegant solution to selecting the rewrite // descriptor type - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>( Source, Target, Naked)); - else - DL->push_back( - std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); + return true; + } + + { + std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back( + std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); return true; } @@ -418,13 +422,7 
@@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -441,13 +439,23 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, return false; } - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>( Source, Target, /*Naked*/ false)); - else - DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( - Source, Transform)); + return true; + } + + { + std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( + Source, Transform)); return true; } @@ -481,13 +489,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -504,13 +506,23 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, return false; } - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>( Source, Target, /*Naked*/ false)); - else - DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>( - Source, Transform)); + return true; + } + + { + 
std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back( + std::make_unique<PatternRewriteNamedAliasDescriptor>(Source, Transform)); return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 491f0b76f4ae..53129e2e5fbb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -170,10 +170,10 @@ private: bool recognizeFindFirstByte(); Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, - unsigned VF, Type *CharTy, BasicBlock *ExitSucc, - BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, - Value *NeedleEnd); + unsigned VF, Type *CharTy, Value *IndPhi, + BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd); void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, BasicBlock *ExitFail, @@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) { return false; } +static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes, + BasicBlock *SuccBB, BasicBlock *IncBB) { + for (PHINode &PN : SuccBB->phis()) { + // Look through the incoming values to find ScalarRes, meaning this is a + // PHI collecting the results of the transformation. + bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == ScalarRes) { + ResPhi = true; + break; + } + + // Any PHI that depended upon the result of the transformation needs a new + // incoming value from IncBB. + if (ResPhi) + PN.addIncoming(VectorRes, IncBB); + else { + // There should be no other outside uses of other values in the + // original loop. Any incoming values should either: + // 1. Be for blocks outside the loop, which aren't interesting. Or .. + // 2. 
These are from blocks in the loop with values defined outside + // the loop. We should a similar incoming value from CmpBB. + for (BasicBlock *BB : PN.blocks()) + if (L->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB); + break; + } + } + } +} + bool LoopIdiomVectorize::recognizeByteCompare() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed @@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch( Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()}, {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load"); - StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE); - auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr); - Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS); - Value *VectorMatchCmp = Builder.CreateIntrinsic( - Intrinsic::vp_icmp, {VectorLhsLoad->getType()}, - {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr, - "mismatch.cmp"); + Value *VectorMatchCmp = + Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp"); Value *CTZ = Builder.CreateIntrinsic( Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()}, {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask, @@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); } - auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { - for (PHINode &PN : SuccBB->phis()) { - // At this point we've already replaced all uses of the result from the - // loop with ByteCmp. Look through the incoming values to find ByteCmp, - // meaning this is a Phi collecting the results of the byte compare. 
- bool ResPhi = false; - for (Value *Op : PN.incoming_values()) - if (Op == ByteCmpRes) { - ResPhi = true; - break; - } - - // Any PHI that depended upon the result of the byte compare needs a new - // incoming value from CmpBB. This is because the original loop will get - // deleted. - if (ResPhi) - PN.addIncoming(ByteCmpRes, CmpBB); - else { - // There should be no other outside uses of other values in the - // original loop. Any incoming values should either: - // 1. Be for blocks outside the loop, which aren't interesting. Or .. - // 2. These are from blocks in the loop with values defined outside - // the loop. We should a similar incoming value from CmpBB. - for (BasicBlock *BB : PN.blocks()) - if (CurLoop->contains(BB)) { - PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); - break; - } - } - } - }; - // Ensure all Phis in the successors of CmpBB have an incoming value from it. - fixSuccessorPhis(EndBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB); if (EndBB != FoundBB) - fixSuccessorPhis(FoundBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB); // The new CmpBB block isn't part of the loop, but will need to be added to // the outer loop if there is one. @@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { Value *LoopIdiomVectorize::expandFindFirstByte( IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, - BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) { + Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, Value *NeedleStart, + Value *NeedleEnd) { // Set up some types and constants that we intend to reuse. 
auto *PtrTy = Builder.getPtrTy(); auto *I64Ty = Builder.getInt64Ty(); @@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte( MatchLCSSA->addIncoming(Search, BB2); MatchPredLCSSA->addIncoming(MatchPred, BB2); + // Ensure all Phis in the successors of BB3/BB5 have an incoming value from + // them. + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3); + if (ExitSucc != ExitFail) + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5); + if (VerifyLoops) { OuterLoop->verifyLoop(); InnerLoop->verifyLoop(); @@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte( DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); - Value *MatchVal = - expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, - SearchStart, SearchEnd, NeedleStart, NeedleEnd); + expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail, + SearchStart, SearchEnd, NeedleStart, NeedleEnd); assert(PHBranch->isUnconditional() && "Expected preheader to terminate with an unconditional branch."); - // Add new incoming values with the result of the transformation to PHINodes - // of ExitSucc that use IndPhi. 
- for (auto *U : llvm::make_early_inc_range(IndPhi->users())) { - auto *PN = dyn_cast<PHINode>(U); - if (PN && PN->getParent() == ExitSucc) - PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent()); - } - if (VerifyLoops && CurLoop->getParentLoop()) { CurLoop->getParentLoop()->verifyLoop(); if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 789047a2a28e..2704e66f3a70 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -15,8 +15,10 @@ // #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() { }); } - if (!LAI->canVectorizeMemory()) + if (!LAI->canVectorizeMemory()) { + if (hasUncountableExitWithSideEffects()) { + reportVectorizationFailure( + "Cannot vectorize unsafe dependencies in uncountable exit loop with " + "side effects", + "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, + TheLoop); + return false; + } + return canVectorizeIndirectUnsafeDependences(); + } if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("We don't allow storing to uniform addresses", @@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isGuaranteedNotToBePoison(CurrV, AC, TheLoop->getLoopPredecessor() ->getTerminator() - ->getIterator())) + ->getIterator(), + DT)) return false; continue; } @@ -1754,16 +1767,24 @@ bool 
LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } }; + bool HasSideEffects = false; for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (I.mayWriteToMemory()) { - // We don't support writes to memory. + if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) { + HasSideEffects = true; + continue; + } + + // We don't support complex writes to memory. reportVectorizationFailure( - "Writes to memory unsupported in early exit loops", - "Cannot vectorize early exit loop with writes to memory", + "Complex writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with complex writes to memory", "WritesInEarlyExitLoop", ORE, TheLoop); return false; - } else if (!IsSafeOperation(&I)) { + } + + if (!IsSafeOperation(&I)) { reportVectorizationFailure("Early exit loop contains operations that " "cannot be speculatively executed", "UnsafeOperationsEarlyExitLoop", ORE, @@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock && "Expected latch predecessor to be the early exiting block"); + SmallVector<LoadInst *, 4> NonDerefLoads; // TODO: Handle loops that may fault. - Predicates.clear(); - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, - &Predicates)) { - reportVectorizationFailure( - "Loop may fault", - "Cannot vectorize potentially faulting early exit loop", - "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + if (!HasSideEffects) { + // Read-only loop. + Predicates.clear(); + if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", "Cannot vectorize non-read-only early exit loop", + "NonReadOnlyEarlyExitLoop", ORE, TheLoop); + return false; + } + } else if (!canUncountableExitConditionLoadBeMoved( + SingleUncountableExitingBlock)) return false; + + // Check non-dereferenceable loads if any. 
+ for (LoadInst *LI : NonDerefLoads) { + // Only support unit-stride access for now. + int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand()); + if (Stride != 1) { + reportVectorizationFailure( + "Loop contains potentially faulting strided load", + "Cannot vectorize early exit loop with " + "strided fault-only-first load", + "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop); + return false; + } + PotentiallyFaultingLoads.insert(LI); + LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI + << "\n"); } [[maybe_unused]] const SCEV *SymbolicMaxBTC = @@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { "backedge taken count: " << *SymbolicMaxBTC << '\n'); UncountableExitingBB = SingleUncountableExitingBlock; + UncountableExitWithSideEffects = HasSideEffects; + return true; +} + +bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved( + BasicBlock *ExitingBlock) { + // Try to find a load in the critical path for the uncountable exit condition. + // This is currently matching about the simplest form we can, expecting + // only one in-loop load, the result of which is directly compared against + // a loop-invariant value. + // FIXME: We're insisting on a single use for now, because otherwise we will + // need to make PHI nodes for other users. That can be done once the initial + // transform code lands. + auto *Br = cast<BranchInst>(ExitingBlock->getTerminator()); + + using namespace llvm::PatternMatch; + Instruction *L = nullptr; + Value *Ptr = nullptr; + Value *R = nullptr; + if (!match(Br->getCondition(), + m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))), + m_Value(R))))) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // FIXME: Don't rely on operand ordering for the comparison. 
+ if (!TheLoop->isLoopInvariant(R)) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // Make sure that the load address is not loop invariant; we want an + // address calculation that we can rotate to the next vector iteration. + const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); + if (!isa<SCEVAddRecExpr>(PtrScev)) { + reportVectorizationFailure( + "Uncountable exit condition depends on load with an address that is " + "not an add recurrence", + "EarlyExitLoadInvariantAddress", ORE, TheLoop); + return false; + } + + // FIXME: Support gathers after first-faulting load support lands. + SmallVector<const SCEVPredicate *, 4> Predicates; + LoadInst *Load = cast<LoadInst>(L); + if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", + "Cannot vectorize potentially faulting early exit loop", + "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + return false; + } + + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(TheLoop); + // We need to know that load will be executed before we can hoist a + // copy out to run just before the first iteration. + // FIXME: Currently, other restrictions prevent us from reaching this point + // with a loop where the uncountable exit condition is determined + // by a conditional load. + assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) && + "Unhandled control flow in uncountable exit loop with side effects"); + + // Prohibit any potential aliasing with any instruction in the loop which + // might store to memory. + // FIXME: Relax this constraint where possible. 
+ for (auto *BB : TheLoop->blocks()) { + for (auto &I : *BB) { + if (&I == Load) + continue; + + if (I.mayWriteToMemory()) { + if (auto *SI = dyn_cast<StoreInst>(&I)) { + AliasResult AR = AA->alias(Ptr, SI->getPointerOperand()); + if (AR == AliasResult::NoAlias) + continue; + } + + reportVectorizationFailure( + "Cannot determine whether critical uncountable exit load address " + "does not alias with a memory write", + "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop); + return false; + } + } + } + return true; } @@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } else { if (!isVectorizableEarlyExitLoop()) { assert(!hasUncountableEarlyExit() && + !hasUncountableExitWithSideEffects() && "Must be false without vectorizable early-exit loop"); if (DoExtraAnalysis) Result = false; @@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } + // Bail out for state-changing loops with uncountable exits for now. + if (UncountableExitWithSideEffects) { + reportVectorizationFailure( + "Writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with writes to memory", + "WritesInEarlyExitLoop", ORE, TheLoop); + return false; + } + if (Result) { LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 838476dcae66..d34d2ae7a0b3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -334,6 +334,10 @@ public: FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL)); } + VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) { + return tryInsertInstruction(new VPExpandSCEVRecipe(Expr)); + } + //===--------------------------------------------------------------------===// // RAII helpers. 
//===--------------------------------------------------------------------===// @@ -559,6 +563,20 @@ public: /// Emit remarks for recipes with invalid costs in the available VPlans. void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); + /// Create a check to \p Plan to see if the vector loop should be executed + /// based on its trip count. + void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const; + + /// Update loop metadata and profile info for both the scalar remainder loop + /// and \p VectorLoop, if it exists. Keeps all loop hints from the original + /// loop on the vector loop and replaces vectorizer-specific metadata. + void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, + VPBasicBlock *HeaderVPBB, + bool VectorizingEpilogue, + unsigned EstimatedVFxUF, + bool DisableRuntimeUnroll); + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -613,13 +631,15 @@ private: /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B. bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B, bool HasTail) const; + const VectorizationFactor &B, bool HasTail, + bool IsEpilogue = false) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B in the context of vectorizing a loop with known \p MaxTripCount. bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - const unsigned MaxTripCount, bool HasTail) const; + const unsigned MaxTripCount, bool HasTail, + bool IsEpilogue = false) const; /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a0f306c12754..3cff43a51029 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -165,15 +165,6 @@ using namespace SCEVPatternMatch; const char VerboseDebug[] = DEBUG_TYPE "-verbose"; #endif -/// @{ -/// Metadata attribute names -const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; -const char LLVMLoopVectorizeFollowupVectorized[] = - "llvm.loop.vectorize.followup_vectorized"; -const char LLVMLoopVectorizeFollowupEpilogue[] = - "llvm.loop.vectorize.followup_epilogue"; -/// @} - STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); @@ -500,26 +491,22 @@ public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, - ElementCount VecWidth, - ElementCount MinProfitableTripCount, - unsigned UnrollFactor, LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &RTChecks, VPlan &Plan) + ElementCount VecWidth, unsigned UnrollFactor, + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, + VPlan &Plan) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC), - VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), - UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM), - BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), + Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), VectorPHVPBB(cast<VPBasicBlock>( Plan.getVectorLoopRegion()->getSinglePredecessor())) {} virtual ~InnerLoopVectorizer() = default; - /// Create a 
new empty loop that will contain vectorized instructions later - /// on, while the old loop will be used as the scalar remainder. Control flow - /// is generated around the vectorized (and scalar epilogue) loops consisting - /// of various checks and bypasses. Return the pre-header block of the new - /// loop. In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. + /// Creates a basic block for the scalar preheader. Both + /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite + /// the method to create additional blocks and checks needed for epilogue + /// vectorization. virtual BasicBlock *createVectorizedLoopSkeleton(); /// Fix the vectorized code, taking care of header phi's, and more. @@ -536,38 +523,18 @@ public: /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } - /// Return the additional bypass block which targets the scalar loop by - /// skipping the epilogue loop after completing the main loop. - BasicBlock *getAdditionalBypassBlock() const { - assert(AdditionalBypassBlock && - "Trying to access AdditionalBypassBlock but it has not been set"); - return AdditionalBypassBlock; - } - protected: friend class LoopVectorizationPlanner; - // Create a check to see if the vector loop should be executed - Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; - - /// Emit a bypass check to see if the vector trip count is zero, including if - /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); - - /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. - void createVectorLoopSkeleton(StringRef Prefix); + /// Create and return a new IR basic block for the scalar preheader whose name + /// is prefixed with \p Prefix. 
+ BasicBlock *createScalarPreheader(StringRef Prefix); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} - /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the - /// vector preheader and its predecessor, also connecting the new block to the - /// scalar preheader. - void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); - /// The original loop. Loop *OrigLoop; @@ -592,8 +559,6 @@ protected: /// vector elements. ElementCount VF; - ElementCount MinProfitableTripCount; - /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -603,18 +568,9 @@ protected: // --- Vectorization state --- - /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader = nullptr; - - /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader = nullptr; - /// Trip count of the original loop. Value *TripCount = nullptr; - /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) - Value *VectorTripCount = nullptr; - /// The profitablity analysis. LoopVectorizationCostModel *Cost; @@ -626,11 +582,6 @@ protected: /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; - /// The additional bypass block which conditionally skips over the epilogue - /// loop after executing the main loop. Needed to resume inductions and - /// reductions during epilogue vectorization. 
- BasicBlock *AdditionalBypassBlock = nullptr; - VPlan &Plan; /// The vector preheader block of \p Plan, used as target for check blocks @@ -679,20 +630,8 @@ public: GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth, - MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, - Checks, Plan), - EPI(EPI) {} - - // Override this function to handle the more complex control flow around the - // three loops. - BasicBlock *createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); - } - - /// The interface for creating a vectorized skeleton using one of two - /// different strategies, each corresponding to one execution of the vplan - /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + UnrollFactor, CM, BFI, PSI, Checks, Plan), + EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {} /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -701,6 +640,9 @@ public: /// iteration count of the loop is so small that the main vector loop is /// completely skipped. EpilogueLoopVectorizationInfo &EPI; + +protected: + ElementCount MinProfitableTripCount; }; /// A specialized derived class of inner loop vectorizer that performs @@ -720,14 +662,24 @@ public: BFI, PSI, Check, Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the - /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *main loop* strategy (i.e., the first pass of VPlan execution). 
+ BasicBlock *createVectorizedLoopSkeleton() final; protected: + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. + void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + + // Create a check to see if the main vector loop should be executed + Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, + unsigned UF) const; + /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). - BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, + bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -736,6 +688,11 @@ protected: // vectorization of *epilogue* loops in the process of vectorizing loops and // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { + /// The additional bypass block which conditionally skips over the epilogue + /// loop after executing the main loop. Needed to resume inductions and + /// reductions during epilogue vectorization. + BasicBlock *AdditionalBypassBlock = nullptr; + public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -749,14 +706,22 @@ public: TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the - /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *epilogue loop* strategy (i.e., the second pass of VPlan execution). + BasicBlock *createVectorizedLoopSkeleton() final; + + /// Return the additional bypass block which targets the scalar loop by + /// skipping the epilogue loop after completing the main loop. 
+ BasicBlock *getAdditionalBypassBlock() const { + assert(AdditionalBypassBlock && + "Trying to access AdditionalBypassBlock but it has not been set"); + return AdditionalBypassBlock; + } protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck( + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -962,8 +927,8 @@ public: /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); - /// \return True if register pressure should be calculated for the given VF. - bool shouldCalculateRegPressureForVF(ElementCount VF); + /// \return True if register pressure should be considered for the given VF. + bool shouldConsiderRegPressureForVF(ElementCount VF); /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. 
We ignore values that remain scalar such as @@ -1159,7 +1124,10 @@ public: CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const { assert(!VF.isScalar() && "Expected vector VF"); - return CallWideningDecisions.at({CI, VF}); + auto I = CallWideningDecisions.find({CI, VF}); + if (I == CallWideningDecisions.end()) + return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0}; + return I->second; } /// Return True if instruction \p I is an optimizable truncate whose operand @@ -1682,7 +1650,9 @@ private: Instruction *I = dyn_cast<Instruction>(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I) || - getWideningDecision(I, VF) == CM_Scalarize) + getWideningDecision(I, VF) == CM_Scalarize || + (isa<CallInst>(I) && + getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1878,6 +1848,8 @@ public: "claimed checks are required"); } + SCEVExp.eraseDeadInstructions(SCEVCheckCond); + if (!MemCheckBlock && !SCEVCheckBlock) return; @@ -2030,7 +2002,7 @@ public: /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR /// outside VPlan. - std::pair<Value *, BasicBlock *> getSCEVChecks() { + std::pair<Value *, BasicBlock *> getSCEVChecks() const { using namespace llvm::PatternMatch; if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2040,7 +2012,7 @@ public: /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR /// outside VPlan. 
- std::pair<Value *, BasicBlock *> getMemRuntimeChecks() { + std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const { using namespace llvm::PatternMatch; if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2049,9 +2021,7 @@ public: /// Return true if any runtime checks have been added bool hasChecks() const { - using namespace llvm::PatternMatch; - return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) || - MemRuntimeCheckCond; + return getSCEVChecks().first || getMemRuntimeChecks().first; } }; } // namespace @@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { +void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan( + BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. VPBlockBase *ScalarPH = Plan.getScalarPreheader(); @@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { } } -Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, - unsigned UF) const { +Value *EpilogueVectorizerMainLoop::createIterationCountCheck( + BasicBlock *VectorPH, ElementCount VF, unsigned UF) const { // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one @@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. 
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; IRBuilder<InstSimplifyFolder> Builder( TCCheckBlock->getContext(), InstSimplifyFolder(TCCheckBlock->getDataLayout())); @@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, return CheckMinIters; } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - Value *CheckMinIters = createIterationCountCheck(VF, UF); - // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); - - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); - ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - - assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() == - TCCheckBlock && - "Plan's entry must be TCCCheckBlock"); -} - /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. 
All @@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, return IRVPBB; } -void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { - LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - assert(LoopVectorPreHeader && "Invalid loop structure"); +BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) { + BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); + assert(VectorPH && "Invalid loop structure"); assert((OrigLoop->getUniqueLatchExitBlock() || Cost->requiresScalarEpilogue(VF.isVector())) && "loops not exiting via the latch without required epilogue?"); - LoopScalarPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, Twine(Prefix) + "scalar.ph"); // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock - // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar - // preheader may be unreachable at this point. Instead it is replaced in - // createVectorizedLoopSkeleton. + // wrapping the newly created scalar preheader here at the moment, because the + // Plan's scalar preheader may be unreachable at this point. Instead it is + // replaced in executePlan. + return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr, + Twine(Prefix) + "scalar.ph"); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore( } BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's - / | preheader are expanded here. Eventually all required SCEV - / | expansion should happen here. - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). 
- | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop (created during VPlan execution). - | | - | v - \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to - | | successors created during VPlan execution) - \/ | - /\ v - | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). - | | - (opt) v <-- edge from middle to exit iff epilogue is not required. - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header - | | wrapped in VPIRBasicBlock). - \ | - \ v - >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) - ... - */ - - // Create an empty vector loop, and prepare basic blocks for the runtime - // checks. - createVectorLoopSkeleton(""); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); - - replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); - return LoopVectorPreHeader; + // Create a new IR basic block for the scalar preheader. + BasicBlock *ScalarPH = createScalarPreheader(""); + return ScalarPH->getSinglePredecessor(); } namespace { @@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Remove redundant induction instructions. cse(HeaderBB); - - // Set/update profile weights for the vector and remainder loops as original - // loop iterations are now distributed among them. Note that original loop - // becomes the scalar remainder loop after vectorization. - // - // For cases like foldTailByMasking() and requiresScalarEpiloque() we may - // end up getting slightly roughened result but that should be OK since - // profile is not inherently precise anyway. 
Note also possible bypass of - // vector code caused by legality checks is ignored, assigning all the weight - // to the vector loop, optimistically. - // - // For scalable vectorization we can't know at compile time how many - // iterations of the loop are handled in one vector iteration, so instead - // use the value of vscale used for tuning. - Loop *VectorLoop = LI->getLoopFor(HeaderBB); - unsigned EstimatedVFxUF = - estimateElementCount(VF * UF, Cost->getVScaleForTuning()); - setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { @@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, toVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); - // Certain instructions can be cheaper to vectorize if they have a constant - // second vector operand. One example of this are shifts on x86. - Value *Op2 = I->getOperand(1); - auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && - Legal->isInvariant(Op2)) - Op2Info.Kind = TargetTransformInfo::OK_UniformValue; - SmallVector<const Value *, 4> Operands(I->operand_values()); SafeDivisorCost += TTI.getArithmeticInstrCost( - I->getOpcode(), VecTy, CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - Op2Info, Operands, I); + I->getOpcode(), VecTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Operands, I); return {ScalarizationCost, SafeDivisorCost}; } @@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } -bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( +bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF( ElementCount VF) { if (!useMaxBandwidth(VF.isScalable() ? 
TargetTransformInfo::RGK_ScalableVector @@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, const unsigned MaxTripCount, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; @@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) && A.Width.isScalable() && !B.Width.isScalable(); auto CmpFn = [PreferScalable](const InstructionCost &LHS, @@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, - HasTail); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail, + IsEpilogue); } void LoopVectorizationPlanner::emitInvalidCostRemarks( @@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenIntOrFpInductionSC: case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: + case VPDef::VPInterleaveEVLSC: case VPDef::VPInterleaveSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: @@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // If no def nor is a store, e.g., branches, continue - no value to check. 
if (R.getNumDefinedValues() == 0 && - !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( - &R)) + !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R)) continue; // For multi-def recipes, currently only interleaved loads, suffice to // check first def only. @@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { /// If the register pressure needs to be considered for VF, /// don't consider the VF as valid if it exceeds the number /// of registers for the target. - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; @@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (!VPI) continue; switch (VPI->getOpcode()) { - case VPInstruction::ActiveLaneMask: + // Selects are only modelled in the legacy cost model for safe + // divisors. 
+ case Instruction::Select: { + VPValue *VPV = VPI->getVPSingleValue(); + if (VPV->getNumUsers() == 1) { + if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; + } + } + } + C += VPI->cost(VF, CostCtx); + break; + } + case VPInstruction::ActiveLaneMask: { + unsigned Multiplier = + cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) + ->getZExtValue(); + C += VPI->cost(VF * Multiplier, CostCtx); + break; + } case VPInstruction::ExplicitVectorLength: C += VPI->cost(VF, CostCtx); break; @@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( } if (Result.Width.isScalar() || - isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking())) + isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(), + /*IsEpilogue*/ true)) Result = NextVF; } @@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); - const Value *Ptr = getLoadStorePointerOperand(I); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); + Value *Ptr = getLoadStorePointerOperand(I); + Type *PtrTy = Ptr->getType(); + + if (!Legal->isUniform(Ptr, VF)) + PtrTy = toVectorTy(PtrTy, VF); return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, @@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI::CastContextHint::None, CostKind, RedOp); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + 
MulCost + Ext2Cost + BaseCost) @@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); InstructionCost ExtraExtCost = 0; if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; @@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); + true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy, + CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) return I == RetI ? RedCost : 0; @@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert(Op0->getType()->getScalarSizeInBits() == 1 && Op1->getType()->getScalarSizeInBits() == 1); - SmallVector<const Value *, 2> Operands{Op0, Op1}; return TTI.getArithmeticInstrCost( - match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, - CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); + match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, + VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I); } Type *CondTy = SI->getCondition()->getType(); @@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { })) continue; VecValuesToIgnore.insert(Op); - DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadInterleavePointerOps, Op->operands()); } for (const auto &[_, Ops] : DeadInvariantStoreOps) @@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { ValuesToIgnore.insert(Op); VecValuesToIgnore.insert(Op); - DeadOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadOps, Op->operands()); } // Ignore type-promoting instructions we identified during reduction @@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) const { - if (ForceTargetInstructionCost.getNumOccurrences()) - return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); - return CM.getInstructionCost(UI, VF); + InstructionCost Cost = CM.getInstructionCost(UI, VF); + if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences()) + return InstructionCost(ForceTargetInstructionCost); + return Cost; } bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I, @@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); 
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " << VF << " because it uses too many registers\n"); @@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { return BestFactor; } -static void addRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().starts_with("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(I)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. 
- NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { using namespace VPlanPatternMatch; assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult && @@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { // epilog loop, fix the reduction's scalar PHI node by adding the incoming value // from the main vector loop. static void fixReductionScalarResumeWhenVectorizingEpilog( - VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) { + VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) { // Get the VPInstruction computing the reduction result in the middle block. // The first operand may not be from the middle block if it is not connected // to the scalar preheader. In that case, there's nothing to fix. @@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( // When fixing reductions in the epilogue loop we should already have // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry // over the incoming values correctly. - auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true)); - EpiResumePhi->setIncomingValueForBlock( + EpiResumePhi.setIncomingValueForBlock( BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } @@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BestVPlan, BestVF, VScale); } - if (!VectorizingEpilogue) { - // Checks are the same for all VPlans, added to BestVPlan only for - // compactness. - attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); - } + // Checks are the same for all VPlans, added to BestVPlan only for + // compactness. + attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); // Retrieving VectorPH now when it's easier while VPlan still has Regions. 
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); @@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); + VPlanTransforms::cse(BestVPlan); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - BasicBlock *EntryBB = - cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock(); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(), State.CFG.PrevBB->getSingleSuccessor()); @@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // looked through single-entry phis. ScalarEvolution &SE = *PSE.getSE(); for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { - if (Exit->getNumPredecessors() == 0) + if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) SE.forgetLcssaPhiWithNewPredecessor( @@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // //===------------------------------------------------===// - // Move check blocks to their final position. - // TODO: Move as part of VPIRBB execute and update impacted tests. - if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second) - MemCheckBlock->moveAfter(EntryBB); - if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second) - SCEVCheckBlock->moveAfter(EntryBB); - BestVPlan.execute(&State); - // 2.5 When vectorizing the epilogue, fix reduction resume values from the - // additional bypass block. 
- if (VectorizingEpilogue) { - assert(!BestVPlan.hasEarlyExit() && - "Epilogue vectorisation not yet supported with early exits"); - BasicBlock *PH = OrigLoop->getLoopPreheader(); - BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (auto *Pred : predecessors(PH)) { - for (PHINode &Phi : PH->phis()) { - if (Phi.getBasicBlockIndex(Pred) != -1) - continue; - Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); - } - } - VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader(); - if (ScalarPH->getNumPredecessors() > 0) { - // If ScalarPH has predecessors, we may need to update its reduction - // resume values. - for (VPRecipeBase &R : ScalarPH->phis()) { - fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State, - BypassBlock); - } - } - } - // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT); - if (HeaderVPBB) { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - std::optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) { - L->setLoopID(*VectorizedLoopID); - } else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); - - // Check if it's EVL-vectorized and mark the corresponding metadata. - bool IsEVLVectorized = - llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { - // Looking for the ExplictVectorLength VPInstruction. 
- if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) - return VI->getOpcode() == VPInstruction::ExplicitVectorLength; - return false; - }); - if (IsEVLVectorized) { - LLVMContext &Context = L->getHeader()->getContext(); - MDNode *LoopID = L->getLoopID(); - auto *IsEVLVectorizedMD = MDNode::get( - Context, - {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), - MDString::get(Context, "evl")}); - MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, - {IsEVLVectorizedMD}); - L->setLoopID(NewLoopID); - } - } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); - } + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar(); + updateLoopMetadataAndProfileInfo( + HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB)) + : nullptr, + HeaderVPBB, VectorizingEpilogue, + estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()), + DisableRuntimeUnroll); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton(""); +BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader(""); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, true); + emitIterationCountCheck(VectorPH, ScalarPH, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); + VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator()) + ->getSuccessor(1); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case // that goes directly through the vector epilogue. The longer-path length for @@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, false); + emitIterationCountCheck(VectorPH, ScalarPH, false); - return LoopVectorPreHeader; + return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator()) + ->getSuccessor(1); } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock * -EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, - bool ForEpilogue) { +BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck( + BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = - createIterationCountCheck(ForEpilogue ? 
EPI.EpilogueVF : EPI.MainLoopVF, - ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); + Value *CheckMinIters = createIterationCountCheck( + VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); + VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + static_cast<DominatorTree *>(nullptr), LI, nullptr, + "vector.ph"); if (ForEpilogue) { // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. EPI.TripCount = Count; } else { - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); } - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); @@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock * -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton("vec.epilog."); - +BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader("vec.epilog."); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. - LoopVectorPreHeader->setName("vec.epilog.ph"); + VectorPH->setName("vec.epilog.ph"); BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, - nullptr, "vec.epilog.iter.check", true); - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr, + "vec.epilog.iter.check", true); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); - emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH, VecEpilogueIterationCountCheck); AdditionalBypassBlock = VecEpilogueIterationCountCheck; @@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && "expected this to be saved from the previous pass."); EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopVectorPreHeader); + VecEpilogueIterationCountCheck, VectorPH); EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); // Adjust the terminators of runtime check blocks and phis using them. 
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second; BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second; if (SCEVCheckBlock) SCEVCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); if (MemCheckBlock) MemCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); - DT->changeImmediateDominator(LoopScalarPreHeader, - EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck); // The vec.epilog.iter.check block may contain Phi nodes from inductions or // reductions which merge control-flow from the latch block and the middle @@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis())); for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); + Phi->moveBefore(VectorPH->getFirstNonPHIIt()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); @@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { Phi->removeIncomingValue(MemCheckBlock); } - return LoopVectorPreHeader; + return VectorPH; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been saved in the first pass."); @@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 
- auto VScale = Cost->getVScaleForTuning(); - unsigned MainLoopStep = - estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); - unsigned EpilogueLoopStep = - estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); - // We assume the remaining `Count` is equally distributed in - // [0, MainLoopStep) - // So the probability for `Count < EpilogueLoopStep` should be - // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep - unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); - const uint32_t Weights[] = {EstimatedSkipCount, - MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); - } + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); + auto VScale = Cost->getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); + unsigned EpilogueLoopStep = + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + // TODO: Improve the estimate by taking the estimated trip count into + // consideration. + unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); + const uint32_t Weights[] = {EstimatedSkipCount, + MainLoopStep - EstimatedSkipCount}; + setBranchWeights(BI, Weights, /*IsExpected=*/false); ReplaceInstWithInst(Insert->getTerminator(), &BI); // A new entry block has been created for the epilogue VPlan. 
Hook it in, as @@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), - Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit()); VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, CM.foldTailByMasking()); @@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); - if (!Recipe) { - SmallVector<VPValue *, 4> Operands(R.operands()); - Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range); - } + if (!Recipe) + Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) { @@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // to remove the need to keep a map of masks beyond the predication // transform. RecipeBuilder.updateBlockMaskCache(Old2New); - for (const auto &[Old, _] : Old2New) + for (VPValue *Old : Old2New.keys()) Old->getDefiningRecipe()->eraseFromParent(); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && @@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); - // Replace VPValues for known constant strides guaranteed by predicate scalar - // evolution. 
- auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { - auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || - R->getParent() == - Plan->getVectorLoopRegion()->getSinglePredecessor(); - }; - for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { - auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); - auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); - // Only handle constant strides for now. - if (!ScevStride) - continue; - - auto *CI = Plan->getOrAddLiveIn( - ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); - if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - - // The versioned value may not be used in the loop directly but through a - // sext/zext. Add new live-ins in those cases. - for (Value *U : StrideV->users()) { - if (!isa<SExtInst, ZExtInst>(U)) - continue; - VPValue *StrideVPV = Plan->getLiveIn(U); - if (!StrideVPV) - continue; - unsigned BW = U->getType()->getScalarSizeInBits(); - APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) - : ScevStride->getAPInt().zext(BW); - VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - } - } + // Replace VPValues for known constant strides. 
+ VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE, + Legal->getLAI()->getSymbolicStrides()); auto BlockNeedsPredication = [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); @@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { OrigLoop, *LI, Legal->getWidestInductionType(), getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); VPlanTransforms::handleEarlyExits(*Plan, - /*HasUncountableExit*/ false, Range); + /*HasUncountableExit*/ false); VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, /*TailFolded*/ false); @@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( void LoopVectorizationPlanner::attachRuntimeChecks( VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const { const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks(); - if (SCEVCheckBlock) { + if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) { assert((!CM.OptForSize || CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); @@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( HasBranchWeights); } const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks(); - if (MemCheckBlock) { + if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { // VPlan-native path does not do any analysis for runtime checks // currently. assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && @@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks( } } +void LoopVectorizationPlanner::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. 
+ bool IsIndvarOverflowCheckNeededForVF = + VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() && + !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) && + CM.getTailFoldingStyle() != + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + const uint32_t *BranchWeigths = + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()) + ? &MinItersBypassWeights[0] + : nullptr; + VPlanTransforms::addMinimumIterationCheck( + Plan, VF, UF, MinProfitableTripCount, + CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(), + IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths, + OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), + *PSE.getSE()); +} + void VPDerivedIVRecipe::execute(VPTransformState &State) { assert(!State.Lane && "VPDerivedIVRecipe being replicated."); @@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath( { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM, + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1, + VF.MinProfitableTripCount); + + LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false); } reportVectorization(ORE, L, VF, 1); - // Mark the loop as already vectorized to avoid vectorizing again. 
- Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues( return EndValueFromAdditionalBypass; } +static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, + VPlan &BestEpiPlan, + LoopVectorizationLegality &LVL, + const SCEV2ValueTy &ExpandedSCEVs, + Value *MainVectorTripCount) { + // Fix reduction resume values from the additional bypass block. + BasicBlock *PH = L->getLoopPreheader(); + for (auto *Pred : predecessors(PH)) { + for (PHINode &Phi : PH->phis()) { + if (Phi.getBasicBlockIndex(Pred) != -1) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); + } + } + auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader()); + if (ScalarPH->hasPredecessors()) { + // If ScalarPH has predecessors, we may need to update its reduction + // resume values. + for (const auto &[R, IRPhi] : + zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) { + fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi, + BypassBlock); + } + } + + // Fix induction resume values from the additional bypass block. + IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt()); + for (const auto &[IVPhi, II] : LVL.getInductionVars()) { + auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); + Value *V = createInductionAdditionalBypassValues( + IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount, + LVL.getPrimaryInduction()); + // TODO: Directly add as extra operand to the VPResumePHI recipe. + Inc->setIncomingValueForBlock(BypassBlock, V); + } +} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); @@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, BFI, PSI, AA); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + if (!LVL.getPotentiallyFaultingLoads().empty()) { + reportVectorizationFailure("Auto-vectorization of loops with potentially " + "faulting load is not supported", + "PotentiallyFaultingLoadsNotSupported", ORE, L); + return false; + } + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify @@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - bool DisableRuntimeUnroll = false; - MDNode *OrigLoopID = L->getLoopID(); - { + // Report the vectorization decision. + if (VF.Width.isScalar()) { using namespace ore; - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - InnerLoopVectorizer Unroller( - L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1), - ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); - - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. 
- VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + assert(IC > 1); + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); + } else { + // Report the vectorization decision. + reportVectorization(ORE, L, VF, IC); + } + if (ORE->allowExtraAnalysis(LV_NAME)) + checkMixedPrecision(L, ORE); - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - LVP.selectEpilogueVectorizationFactor(VF.Width, IC); - if (EpilogueVF.Width.isVector()) { - std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. - VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); - BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); - preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); - EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, - BestEpiPlan); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, - BFI, PSI, Checks, *BestMainPlan); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - *BestMainPlan, MainILV, DT, false); - ++LoopsVectorized; - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. 
- EpilogueVectorizerEpilogueLoop EpilogILV( - L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); - EpilogILV.setTripCount(MainILV.getTripCount()); - preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - - LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT, true); - - // Fix induction resume values from the additional bypass block. - BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock(); - IRBuilder<> BypassBuilder(BypassBlock, - BypassBlock->getFirstInsertionPt()); - BasicBlock *PH = L->getLoopPreheader(); - for (const auto &[IVPhi, II] : LVL.getInductionVars()) { - auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); - Value *V = createInductionAdditionalBypassValues( - IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount, - LVL.getPrimaryInduction()); - // TODO: Directly add as extra operand to the VPResumePHI recipe. - Inc->setIncomingValueForBlock(BypassBlock, V); - } - ++LoopsEpilogueVectorized; + // If we decided that it is *legal* to interleave or vectorize the loop, then + // do it. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, - VF.MinProfitableTripCount, IC, &CM, BFI, PSI, - Checks, BestPlan); - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. - VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there - // are no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } - // Report the vectorization decision. 
- reportVectorization(ORE, L, VF, IC); - } + VPlan &BestPlan = LVP.getPlanFor(VF.Width); + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + LVP.selectEpilogueVectorizationFactor(VF.Width, IC); + if (EpilogueVF.Width.isVector()) { + std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); + BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); + preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, + BestEpiPlan); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, + PSI, Checks, *BestMainPlan); + auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, + *BestMainPlan, MainILV, DT, false); + ++LoopsVectorized; + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, + BFI, PSI, Checks, BestEpiPlan); + EpilogILV.setTripCount(MainILV.getTripCount()); + preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); + + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, + true); + + fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L, + BestEpiPlan, LVL, ExpandedSCEVs, + EPI.VectorTripCount); + ++LoopsEpilogueVectorized; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI, + Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. 
+ VPlanTransforms::runPass( + VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width, + IC, PSE); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC, + VF.MinProfitableTripCount); - if (ORE->allowExtraAnalysis(LV_NAME)) - checkMixedPrecision(L, ORE); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + ++LoopsVectorized; } assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "DT not preserved correctly"); + assert(!verifyFunction(*F, &dbgs())); - std::optional<MDNode *> RemainderLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID) { - L->setLoopID(*RemainderLoopID); - } else { - if (DisableRuntimeUnroll) - addRuntimeUnrollDisableMetaData(L); - - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); - } - - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, DB = &AM.getResult<DemandedBitsAnalysis>(F); ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); LAIs = &AM.getResult<LoopAccessAnalysis>(F); + AA = &AM.getResult<AAManager>(F); auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37dc41413966..6a56dbfaa015 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper { return false; } bool equal(unsigned Opcode) { - if (Opcode == I->getOpcode()) - return trySet(MainOpBIT, MainOpBIT); - return false; + return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT); } unsigned getOpcode() const { MaskType Candidate = Mask & SeenBefore; @@ -5576,7 +5574,23 @@ private: if (auto *SD = 
dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, {}); + SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles; + SmallVector<ScheduleBundle *> Bundles; + Instruction *In = SD->getInst(); + if (R.isVectorized(In)) { + ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In); + for (TreeEntry *TE : Entries) { + if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) && + In->getNumOperands() != TE->getNumOperands()) + continue; + auto &BundlePtr = + PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>()); + BundlePtr->setTreeEntry(TE); + BundlePtr->add(SD); + Bundles.push_back(BundlePtr.get()); + } + } + ProcessBundleMember(SD, Bundles); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); @@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) { } /// Checks if the provided list of pointers \p Pointers represents the strided -/// pointers for type ElemTy. If they are not, std::nullopt is returned. -/// Otherwise, if \p Inst is not specified, just initialized optional value is -/// returned to show that the pointers represent strided pointers. If \p Inst -/// specified, the runtime stride is materialized before the given \p Inst. -/// \returns std::nullopt if the pointers are not pointers with the runtime -/// stride, nullptr or actual stride value, otherwise. -static std::optional<Value *> -calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &SortedIndices, - Instruction *Inst = nullptr) { +/// pointers for type ElemTy. If they are not, nullptr is returned. +/// Otherwise, SCEV* of the stride value is returned. 
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { SmallVector<const SCEV *> SCEVs; const SCEV *PtrSCEVLowest = nullptr; const SCEV *PtrSCEVHighest = nullptr; @@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, for (Value *Ptr : PointerOps) { const SCEV *PtrSCEV = SE.getSCEV(Ptr); if (!PtrSCEV) - return std::nullopt; + return nullptr; SCEVs.push_back(PtrSCEV); if (!PtrSCEVLowest && !PtrSCEVHighest) { PtrSCEVLowest = PtrSCEVHighest = PtrSCEV; @@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, } const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Diff)) - return std::nullopt; + return nullptr; if (Diff->isNonConstantNegative()) { PtrSCEVLowest = PtrSCEV; continue; } const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV); if (isa<SCEVCouldNotCompute>(Diff1)) - return std::nullopt; + return nullptr; if (Diff1->isNonConstantNegative()) { PtrSCEVHighest = PtrSCEV; continue; @@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, // Dist = PtrSCEVHighest - PtrSCEVLowest; const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Dist)) - return std::nullopt; + return nullptr; int Size = DL.getTypeStoreSize(ElemTy); auto TryGetStride = [&](const SCEV *Dist, const SCEV *Multiplier) -> const SCEV * { @@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1)); Stride = TryGetStride(Dist, Sz); if (!Stride) - return std::nullopt; + return nullptr; } if (!Stride || isa<SCEVConstant>(Stride)) - return std::nullopt; + return nullptr; // Iterate through all pointers and check if all distances are // unique multiple of Stride. 
using DistOrdPair = std::pair<int64_t, int>; @@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); const SCEV *Coeff = TryGetStride(Diff, Stride); if (!Coeff) - return std::nullopt; + return nullptr; const auto *SC = dyn_cast<SCEVConstant>(Coeff); if (!SC || isa<SCEVCouldNotCompute>(SC)) - return std::nullopt; + return nullptr; if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, SE.getMulExpr(Stride, SC))) ->isZero()) - return std::nullopt; + return nullptr; Dist = SC->getAPInt().getZExtValue(); } // If the strides are not the same or repeated, we can't vectorize. if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) - return std::nullopt; + return nullptr; auto Res = Offsets.emplace(Dist, Cnt); if (!Res.second) - return std::nullopt; + return nullptr; // Consecutive order if the inserted element is the last one. IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); ++Cnt; } if (Offsets.size() != SCEVs.size()) - return std::nullopt; + return nullptr; SortedIndices.clear(); if (!IsConsecutive) { // Fill SortedIndices array only if it is non-consecutive. @@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, ++Cnt; } } - if (!Inst) - return nullptr; - SCEVExpander Expander(SE, DL, "strided-load-vec"); - return Expander.expandCodeFor(Stride, Stride->getType(), Inst); + return Stride; } static std::pair<InstructionCost, InstructionCost> @@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() { // it is an attempt to reorder node with reused scalars but with // external uses. 
if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + OrdersUses.try_emplace(OrdersType(), 0).first->second += ExternalUserReorderIndices.size(); } else { for (const OrdersType &ExtOrder : ExternalUserReorderIndices) - ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + ++OrdersUses.try_emplace(ExtOrder, 0).first->second; } // No other useful reorder data in this entry. if (Order.empty()) @@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + ++OrdersUses.try_emplace(CurrentOrder, 0).first->second; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + ++OrdersUses.try_emplace(Order, 0).first->second; } } if (OrdersUses.empty()) @@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += - NumOps; + OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps; } else { - OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + OrdersUses.try_emplace(Order, 0).first->second += NumOps; } - auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + auto Res = OrdersUses.try_emplace(OrdersType(), 0); const auto AllowsReordering = [&](const TreeEntry *TE) { if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || @@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis { } } } - if (MainOp) + if (MainOp) { + // Do not match, if any copyable is a terminator from the same block as + // the main operation. 
+ if (any_of(VL, [&](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && I->getParent() == MainOp->getParent() && + I->isTerminator(); + })) { + MainOp = nullptr; + return; + } MainOpcode = MainOp->getOpcode(); + } } /// Returns the idempotent value for the \p MainOp with the detected \p @@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( } SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars); if (all_of(VL, [&](Value *V) { - return isa<PoisonValue>(V) || Values.contains(V); + return isa<PoisonValue>(V) || Values.contains(V) || + (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) && + LI->getLoopFor(S.getMainOp()->getParent()) && + isVectorized(V)); })) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false); @@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); } + Value *getVectorizedValue(const TreeEntry &E) { + Value *Vec = E.VectorizedValue; + if (!Vec->getType()->isIntOrIntVectorTy()) + return Vec; + return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) { + return !isa<PoisonValue>(V) && + !isKnownNonNegative( + V, SimplifyQuery(*R.DL)); + })); + } + public: ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {} @@ -18001,35 +18030,14 @@ public: /// Adds 2 input vectors (in form of tree entries) and the mask for their /// shuffling. 
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); - Value *V2 = E2.VectorizedValue; - if (V2->getType()->isIntOrIntVectorTy()) - V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); + Value *V2 = getVectorizedValue(E2); add(V1, V2, Mask); } /// Adds single input vector (in form of tree entry) and the mask for its /// shuffling. void add(const TreeEntry &E1, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); add(V1, Mask); } /// Adds 2 input vectors and the mask for their shuffling. @@ -18178,14 +18186,7 @@ public: auto CreateSubVectors = [&](Value *Vec, SmallVectorImpl<int> &CommonMask) { for (auto [E, Idx] : SubVectors) { - Value *V = E->VectorizedValue; - if (V->getType()->isIntOrIntVectorTy()) - V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V = getVectorizedValue(*E); unsigned InsertionIndex = Idx * getNumElements(ScalarTy); // Use scalar version of the SCalarType to correctly handle shuffles // for revectorization. 
The revectorization mode operates by the @@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return cast<LoadInst>(V)->getPointerOperand(); }); OrdersType Order; - std::optional<Value *> Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, - &*Builder.GetInsertPoint()); + const SCEV *StrideSCEV = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); + assert(StrideSCEV && "At this point stride should be known"); + SCEVExpander Expander(*SE, *DL, "strided-load-vec"); + Value *Stride = Expander.expandCodeFor( + StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint()); Value *NewStride = - Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); + Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); StrideVal = Builder.CreateMul( NewStride, ConstantInt::get( @@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree( !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && - is_contained(VectorizableTree.front()->Scalars, I))) + is_contained(VectorizableTree.front()->Scalars, I)) && + !(!VectorizableTree.front()->isGather() && + VectorizableTree.front()->isCopyableElement(I))) continue; SmallVector<SelectInst *> LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { @@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, continue; } auto *SD = cast<ScheduleData>(SE); + if (SD->hasValidDependencies() && + (!S.areInstructionsWithCopyableElements() || + !S.isCopyableElement(SD->getInst())) && + !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE && + EI.UserTE->hasState() && + (!EI.UserTE->hasCopyableElements() || + !EI.UserTE->isCopyableElement(SD->getInst()))) + SD->clearDirectDependencies(); for (const Use &U : SD->getInst()->operands()) { unsigned &NumOps = UserOpToNumOps @@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 
BoUpSLP *SLP, if (auto *Op = dyn_cast<Instruction>(U.get()); Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, *SLP, NumOps)) { - if (ScheduleData *OpSD = getScheduleData(Op)) { + if (ScheduleData *OpSD = getScheduleData(Op); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ScheduleCopyableDataMapByUsers.erase(I); ScheduleCopyableDataMap.erase(KV); // Need to recalculate dependencies for the actual schedule data. - if (ScheduleData *OpSD = getScheduleData(I)) { + if (ScheduleData *OpSD = getScheduleData(I); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote( return TryProcessInstruction(BitWidth); case Instruction::ZExt: case Instruction::SExt: + if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() && + E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast && + E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy()) + return false; IsProfitableToDemote = true; return TryProcessInstruction(BitWidth); @@ -23797,9 +23817,7 @@ public: size_t Key, Idx; std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, /*AllowAlternate=*/false); - ++PossibleReducedVals[Key][Idx] - .insert(std::make_pair(V, 0)) - .first->second; + ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second; } for (Instruction *I : reverse(PossibleReductionOps)) Worklist.emplace_back(I, I->getParent() == BB ? 
0 : Level + 1); @@ -23820,21 +23838,20 @@ public: stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { return P1.size() > P2.size(); }); - int NewIdx = -1; + bool First = true; for (ArrayRef<Value *> Data : PossibleRedValsVect) { - if (NewIdx < 0 || - (!isGoodForReduction(Data) && - (!isa<LoadInst>(Data.front()) || - !isa<LoadInst>(ReducedVals[NewIdx].front()) || - getUnderlyingObject( - cast<LoadInst>(Data.front())->getPointerOperand()) != - getUnderlyingObject( - cast<LoadInst>(ReducedVals[NewIdx].front()) - ->getPointerOperand())))) { - NewIdx = ReducedVals.size(); + if (First) { + First = false; ReducedVals.emplace_back(); + } else if (!isGoodForReduction(Data)) { + auto *LI = dyn_cast<LoadInst>(Data.front()); + auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front()); + if (!LI || !LastLI || + getUnderlyingObject(LI->getPointerOperand()) != + getUnderlyingObject(LastLI->getPointerOperand())) + ReducedVals.emplace_back(); } - ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); + ReducedVals.back().append(Data.rbegin(), Data.rend()); } } // Sort the reduced values by number of same/alternate opcode and/or pointer @@ -23847,7 +23864,8 @@ public: /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI, AssumptionCache *AC) { + const TargetLibraryInfo &TLI, AssumptionCache *AC, + DominatorTree &DT) { constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce @@ -24164,9 +24182,7 @@ public: // previous vectorization attempts. if (any_of(VL, [&V](Value *RedVal) { auto *RedValI = dyn_cast<Instruction>(RedVal); - if (!RedValI) - return false; - return V.isDeleted(RedValI); + return RedValI && V.isDeleted(RedValI); })) break; V.buildTree(VL, IgnoreList); @@ -24248,7 +24264,7 @@ public: // Estimate cost. 
InstructionCost ReductionCost = - getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V); + getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI); InstructionCost Cost = V.getTreeCost(VL, ReductionCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); @@ -24553,7 +24569,9 @@ private: InstructionCost getReductionCost(TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals, bool IsCmpSelMinMax, FastMathFlags FMF, - const BoUpSLP &R) { + const BoUpSLP &R, DominatorTree &DT, + const DataLayout &DL, + const TargetLibraryInfo &TLI) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = ReducedVals.front()->getType(); unsigned ReduxWidth = ReducedVals.size(); @@ -24578,6 +24596,22 @@ private: for (User *U : RdxVal->users()) { auto *RdxOp = cast<Instruction>(U); if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { + if (RdxKind == RecurKind::FAdd) { + InstructionCost FMACost = canConvertToFMA( + RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI); + if (FMACost.isValid()) { + LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n"); + if (auto *I = dyn_cast<Instruction>(RdxVal)) { + // Also, exclude scalar fmul cost. + InstructionCost FMulCost = + TTI->getInstructionCost(I, CostKind); + LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + ScalarCost += FMACost; + continue; + } + } ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); continue; } @@ -24642,8 +24676,45 @@ private: auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); VectorType *RVecTy = getWidenedType(RType, ReduxWidth); - VectorCost += - TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); + InstructionCost FMACost = InstructionCost::getInvalid(); + if (RdxKind == RecurKind::FAdd) { + // Check if the reduction operands can be converted to FMA. 
+ SmallVector<Value *> Ops; + FastMathFlags FMF; + FMF.set(); + for (Value *RdxVal : ReducedVals) { + if (!RdxVal->hasOneUse()) { + Ops.clear(); + break; + } + if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal)) + FMF &= FPCI->getFastMathFlags(); + Ops.push_back(RdxVal->user_back()); + } + if (!Ops.empty()) { + FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL, + *TTI, TLI); + if (FMACost.isValid()) { + // Calculate actual FMAD cost. + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy, + {RVecTy, RVecTy, RVecTy}, FMF); + FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind); + + LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n"); + // Also, exclude vector fmul cost. + InstructionCost FMulCost = TTI->getArithmeticInstrCost( + Instruction::FMul, RVecTy, CostKind); + LLVM_DEBUG(dbgs() + << "Minus vector FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + } + } + if (FMACost.isValid()) + VectorCost += FMACost; + else + VectorCost += + TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); if (RType != RedTy) { unsigned Opcode = Instruction::Trunc; if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) @@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( HorizontalReduction HorRdx; if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (RedCost >= ScalarCost) return false; - return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr; + return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr; }; if (Candidates.size() == 1) return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R); @@ -25540,7 +25611,7 @@ bool 
SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, template <typename T> static bool tryToVectorizeSequence( SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, - function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, T *)> AreCompatible, function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R) { bool Changed = false; @@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence( auto *SameTypeIt = IncIt; while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || R.isDeleted(cast<Instruction>(*SameTypeIt)) || - AreCompatible(*SameTypeIt, *IncIt))) { + AreCompatible(VL, *SameTypeIt))) { auto *I = dyn_cast<Instruction>(*SameTypeIt); ++SameTypeIt; if (I && !R.isDeleted(I)) @@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, return compareCmp<false>(V, V2, *TLI, *DT); }; - auto AreCompatibleCompares = [&](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) { + if (VL.empty() || VL.back() == V1) return true; - return compareCmp<true>(V1, V2, *TLI, *DT); + return compareCmp<true>(V1, VL.back(), *TLI, *DT); }; SmallVector<Value *> Vals; @@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return false; }; - auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL, + Value *V1) { + if (VL.empty() || V1 == VL.back()) return true; + Value *V2 = VL.back(); if (V1->getType() != V2->getType()) return false; ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; @@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { InstSetVector PostProcessInserts; SmallSetVector<CmpInst *, 8> PostProcessCmps; - // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true + // Vectorizes 
Inserts in `PostProcessInserts` and if `VectorizeCmps` is true // also vectorizes `PostProcessCmps`. auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { bool Changed = vectorizeInserts(PostProcessInserts, BB, R); @@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { + bool SameParent = true; + auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) { + if (VL.empty()) { + SameParent = true; + return true; + } + StoreInst *V2 = VL.back(); if (V1 == V2) return true; if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) @@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (isa<UndefValue>(V1->getValueOperand()) || isa<UndefValue>(V2->getValueOperand())) return true; - if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand())) - if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { - if (I1->getParent() != I2->getParent()) - return false; - return getSameOpcode({I1, I2}, *TLI).valid(); - } if (isa<Constant>(V1->getValueOperand()) && isa<Constant>(V2->getValueOperand())) return true; + // Check if the operands of the stores can be vectorized. They can be + // vectorized, if they have compatible operands or have operands, which can + // be vectorized as copyables. + auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()); + auto *I2 = dyn_cast<Instruction>(V2->getValueOperand()); + if (I1 || I2) { + // Accept only tail-following non-compatible values for now. + // TODO: investigate if it is possible to vectorize incompatible values, + // if the copyables are first in the list. 
+ if (I1 && !I2) + return false; + SameParent &= I1 && I2 && I1->getParent() == I2->getParent(); + SmallVector<Value *> NewVL(VL.size() + 1); + for (auto [SI, V] : zip(VL, NewVL)) + V = SI->getValueOperand(); + NewVL.back() = V1->getValueOperand(); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true, + /*SkipSameCodeCheck=*/!SameParent); + if (S) + return true; + if (!SameParent) + return false; + } return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f972efa07eb7..16b1b539345d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <cassert> #include <string> @@ -55,6 +56,15 @@ namespace llvm { extern cl::opt<bool> EnableVPlanNativePath; } +/// @{ +/// Metadata attribute names +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = + "llvm.loop.vectorize.followup_vectorized"; +const char LLVMLoopVectorizeFollowupEpilogue[] = + "llvm.loop.vectorize.followup_epilogue"; +/// @} + extern cl::opt<unsigned> ForceTargetInstructionCost; static cl::opt<bool> PrintVPlansInDotFormat( @@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) { for (unsigned i = 0; i < WorkList.size(); i++) { T *Current = WorkList[i]; - if (Current->getNumPredecessors() == 0) + if (!Current->hasPredecessors()) return Current; auto &Predecessors = Current->getPredecessors(); WorkList.insert_range(Predecessors); @@ -216,7 +226,7 @@ bool 
VPBlockUtils::isHeader(const VPBlockBase *VPB, // If VPBB is in a region R, VPBB is a loop header if R is a loop region with // VPBB as its entry, i.e., free of predecessors. if (auto *R = VPBB->getParent()) - return !R->isReplicator() && VPBB->getNumPredecessors() == 0; + return !R->isReplicator() && !VPBB->hasPredecessors(); // A header dominates its second predecessor (the latch), with the other // predecessor being the preheader @@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); + // Move completely disconnected blocks to their final position. + if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB)) + IRBB->moveAfter(State->CFG.PrevBB); State->Builder.SetInsertPoint(IRBB->getTerminator()); State->CFG.PrevBB = IRBB; State->CFG.VPBB2IRBB[this] = IRBB; @@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const { const VPBlockBase *Pred = nullptr; - if (getNumPredecessors() > 0) { + if (hasPredecessors()) { Pred = getPredecessors()[Idx]; } else { auto *Region = getParent(); @@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() { BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); VPIRBasicBlock *NewScalarHeader = nullptr; - if (getScalarHeader()->getNumPredecessors() == 0) { - NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); - } else { + if (getScalarHeader()->hasPredecessors()) { NewScalarHeader = cast<VPIRBasicBlock>(*find_if( vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) { auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB); return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; })); + } else { + NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); } // Create VPlan, clone live-ins and 
remap operands in the cloned blocks. auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader); @@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) { std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str(); // First assign the base name for V. - const auto &[A, _] = VPValue2Name.insert({V, BaseName}); + const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName); // Integer or FP constants with different types will result in he same string // due to stripping types. if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV)) @@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) { // If it is already used by C > 0 other VPValues, increase the version counter // C and use it for V. - const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0}); + const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0); if (!UseInserted) { C->second++; A->second = (BaseName + Twine(".") + Twine(C->second)).str(); @@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void addRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. + for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + if (S->getString().starts_with("llvm.loop.unroll.runtime.disable")) + continue; + IsUnrollMetadata = + S->getString().starts_with("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(I)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. 
+ LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + +void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo( + Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue, + unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) { + MDNode *LID = OrigLoop->getLoopID(); + // Update the metadata of the scalar loop. Skip the update when vectorizing + // the epilogue loop, to ensure it is only updated once. + if (!VectorizingEpilogue) { + std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID( + LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); + if (RemainderLoopID) { + OrigLoop->setLoopID(*RemainderLoopID); + } else { + if (DisableRuntimeUnroll) + addRuntimeUnrollDisableMetaData(OrigLoop); + + LoopVectorizeHints Hints(OrigLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + } + + if (!VectorLoop) + return; + + if (std::optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized})) { + VectorLoop->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (LID) + VectorLoop->setLoopID(LID); + + if (!VectorizingEpilogue) { + LoopVectorizeHints Hints(VectorLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + + // Check if it's EVL-vectorized and mark the corresponding metadata. + bool IsEVLVectorized = + llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { + // Looking for the ExplictVectorLength VPInstruction. 
+ if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) + return VI->getOpcode() == VPInstruction::ExplicitVectorLength; + return false; + }); + if (IsEVLVectorized) { + LLVMContext &Context = VectorLoop->getHeader()->getContext(); + MDNode *LoopID = VectorLoop->getLoopID(); + auto *IsEVLVectorizedMD = MDNode::get( + Context, + {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), + MDString::get(Context, "evl")}); + MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, + {IsEVLVectorizedMD}); + VectorLoop->setLoopID(NewLoopID); + } + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(VectorLoop); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. Note that original loop + // becomes the scalar remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpilogue() we may + // end up getting slightly roughened result but that should be OK since + // profile is not inherently precise anyway. Note also possible bypass of + // vector code caused by legality checks is ignored, assigning all the weight + // to the vector loop, optimistically. + // + // For scalable vectorization we can't know at compile time how many + // iterations of the loop are handled in one vector iteration, so instead + // use the value of vscale used for tuning.
+ setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { if (VPlans.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d6bc462a0dfa..53291a931530 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -219,6 +219,9 @@ public: size_t getNumSuccessors() const { return Successors.size(); } size_t getNumPredecessors() const { return Predecessors.size(); } + /// Returns true if this block has any predecessors. + bool hasPredecessors() const { return !Predecessors.empty(); } + /// An Enclosing Block of a block B is any block containing B, including B /// itself. \return the closest enclosing block starting from "this", which /// has successors. \return the root enclosing block if all enclosing blocks @@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase public: VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPDef(SC), VPUser(Operands), DL(DL) {} virtual ~VPRecipeBase() = default; @@ -518,11 +521,11 @@ protected: class VPSingleDefRecipe : public VPRecipeBase, public VPValue { public: VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this) {} VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - Value *UV, DebugLoc DL = {}) + Value *UV, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {} static inline bool classof(const VPRecipeBase *R) { @@ -557,6 +560,7 @@ public: case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPInterleaveEVLSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: case 
VPRecipeBase::VPWidenLoadEVLSC: @@ -712,12 +716,15 @@ public: VPIRFlags(GEPNoWrapFlags GEPFlags) : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} -public: void transferFlags(VPIRFlags &Other) { OpType = Other.OpType; AllFlags = Other.AllFlags; } + /// Only keep flags also present in \p Other. \p Other must have the same + /// OpType as the current object. + void intersectFlags(const VPIRFlags &Other); + /// Drop all poison-generating flags. void dropPoisonGeneratingFlags() { // NOTE: This needs to be kept in-sync with @@ -864,7 +871,7 @@ public: /// using IR flags. struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, @@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}) + const VPIRFlags &Flags, + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {} static inline bool classof(const VPRecipeBase *R) { @@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { return R && classof(R); } + static inline bool classof(const VPSingleDefRecipe *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } + void execute(VPTransformState &State) override = 0; /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx. @@ -975,6 +988,10 @@ public: Not, SLPLoad, SLPStore, + // Creates a mask where each lane is active (true) whilst the current + // counter (first operand + index) is less than the second operand. i.e. 
+ // mask[i] = icmp ult (op0 + i), op1 + // The size of the mask returned is VF * Multiplier (UF, third op). ActiveLaneMask, ExplicitVectorLength, CalculateTripCountMinusVF, @@ -1014,7 +1031,8 @@ public: // Returns a scalar boolean value, which is true if any lane of its // (boolean) vector operands is true. It produces the reduced value across // all unrolled iterations. Unrolling will add all copies of its original - // operand as additional operands. + // operand as additional operands. AnyOf is poison-safe as all operands + // will be frozen. AnyOf, // Calculates the first active lane index of the vector predicate operands. // It produces the lane index across all unrolled iterations. Unrolling will @@ -1080,13 +1098,13 @@ private: #endif public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {}, - const Twine &Name = "") + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}, + const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1479,7 +1497,8 @@ public: } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - const VPIRFlags &Flags = {}, DebugLoc DL = {}) + const VPIRFlags &Flags = {}, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL), VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) { assert(flagsValidForOpcode(Opcode) && @@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL =
DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), @@ -1546,7 +1565,7 @@ public: VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); @@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, public: VPWidenCallRecipe(Value *UV, Function *Variant, - ArrayRef<VPValue *> CallArguments, DebugLoc DL = {}) + ArrayRef<VPValue *> CallArguments, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, *cast<Instruction>(UV)), VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) { @@ -1644,10 +1664,8 @@ public: return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); } - operand_range args() { return make_range(op_begin(), std::prev(op_end())); } - const_operand_range args() const { - return make_range(op_begin(), std::prev(op_end())); - } + operand_range args() { return drop_end(operands()); } + const_operand_range args() const { return drop_end(operands()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase { public: VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {} ~VPHistogramRecipe() override = default; @@ -1998,6 +2016,9 @@ public: return getOperand(1); } + /// Update the incoming value from the loop backedge. 
+ void setBackedgeValue(VPValue *V) { setOperand(1, V); } + /// Returns the backedge value as a recipe. The backedge value is guaranteed /// to be a recipe. virtual VPRecipeBase &getBackedgeRecipe() { @@ -2229,8 +2250,8 @@ protected: public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. - VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {}, - const Twine &Name = "") + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL), Name(Name.str()) { if (Start) @@ -2381,9 +2402,8 @@ public: } VPBlendRecipe *clone() override { - SmallVector<VPValue *> Ops(operands()); - return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops, - getDebugLoc()); + return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), + operands(), getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPBlendSC) @@ -2409,6 +2429,12 @@ public: return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized()); } + /// Set mask number \p Idx to \p V. + void setMask(unsigned Idx, VPValue *V) { + assert((Idx > 0 || !isNormalized()) && "First index has no mask!"); + Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V); + } + void execute(VPTransformState &State) override { llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); } @@ -2434,12 +2460,13 @@ public: } }; -/// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. The first operand of a -/// VPInterleave recipe is the address, followed by the stored values, followed -/// by an optional mask. -class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, - public VPIRMetadata { +/// A common base class for interleaved memory operations. 
+/// An Interleaved memory operation is a memory access method that combines +/// multiple strided loads/stores into a single wide load/store with shuffles. +/// The first operand is the start address. The optional operands are, in order, +/// the stored values and the mask. +class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase, + public VPIRMetadata { const InterleaveGroup<Instruction> *IG; /// Indicates if the interleave group is in a conditional block and requires a @@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, /// unusued gaps can be loaded speculatively. bool NeedsMaskForGaps = false; -public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask, - bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) - : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD), - IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) { +protected: + VPInterleaveBase(const unsigned char SC, + const InterleaveGroup<Instruction> *IG, + ArrayRef<VPValue *> Operands, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG), + NeedsMaskForGaps(NeedsMaskForGaps) { // TODO: extend the masked interleaved-group support to reversed access. 
assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); @@ -2473,14 +2502,19 @@ public: addOperand(Mask); } } - ~VPInterleaveRecipe() override = default; - VPInterleaveRecipe *clone() override { - return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(), - NeedsMaskForGaps, *this, getDebugLoc()); +public: + VPInterleaveBase *clone() override = 0; + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPInterleaveSC || + R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC; } - VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } /// Return the address accessed by this recipe. VPValue *getAddr() const { @@ -2490,48 +2524,130 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last, currently 2nd operand. + // Mask is optional and the last operand. return HasMask ? getOperand(getNumOperands() - 1) : nullptr; } + /// Return true if the access needs a mask because of the gaps. + bool needsMaskForGaps() const { return NeedsMaskForGaps; } + + const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; } + + Instruction *getInsertPos() const { return IG->getInsertPos(); } + + void execute(VPTransformState &State) override { + llvm_unreachable("VPInterleaveBase should not be instantiated."); + } + + /// Return the cost of this recipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + + /// Returns the number of stored operands of this interleave group. Returns 0 + /// for load interleave groups. 
+ virtual unsigned getNumStoreOperands() const = 0; + /// Return the VPValues stored by this interleave group. If it is a load /// interleave group, return an empty ArrayRef. ArrayRef<VPValue *> getStoredValues() const { - // The first operand is the address, followed by the stored values, followed - // by an optional mask. - return ArrayRef<VPValue *>(op_begin(), getNumOperands()) - .slice(1, getNumStoreOperands()); + return ArrayRef<VPValue *>(op_end() - + (getNumStoreOperands() + (HasMask ? 1 : 0)), + getNumStoreOperands()); + } +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. +class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { +public: + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask, + NeedsMaskForGaps, MD, DL) {} + + ~VPInterleaveRecipe() override = default; + + VPInterleaveRecipe *clone() override { + return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(), + getStoredValues(), getMask(), + needsMaskForGaps(), *this, getDebugLoc()); } + VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; - /// Return the cost of this VPInterleaveRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif - const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + } - /// Returns the number of stored operands of this interleave group. Returns 0 - /// for load interleave groups. - unsigned getNumStoreOperands() const { - return getNumOperands() - (HasMask ? 2 : 1); + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 2 : 1); } +}; + +/// A recipe for interleaved memory operations with vector-predication +/// intrinsics. The first operand is the address, the second operand is the +/// explicit vector length. Stored values and mask are optional operands. +class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { +public: + VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask) + : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(), + ArrayRef<VPValue *>({R.getAddr(), &EVL}), + R.getStoredValues(), Mask, R.needsMaskForGaps(), R, + R.getDebugLoc()) { + assert(!getInterleaveGroup()->isReverse() && + "Reversed interleave-group with tail folding is not supported."); + assert(!needsMaskForGaps() && "Interleaved access with gap mask is not " + "supported for scalable vector."); + } + + ~VPInterleaveEVLRecipe() override = default; + + VPInterleaveEVLRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC) + + /// The VPValue of the explicit vector length. + VPValue *getEVL() const { return getOperand(1); } - /// The recipe only uses the first lane of the address. + /// Generate the wide load or store, and shuffles. 
+ void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The recipe only uses the first lane of the address, and EVL operand. bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || + Op == getEVL(); } - Instruction *getInsertPos() const { return IG->getInsertPos(); } + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 3 : 2); + } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2561,14 +2677,14 @@ protected: public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} @@ -2686,7 +2802,7 @@ public: class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { public: VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe( VPDef::VPReductionEVLSC, R.getRecurrenceKind(), R.getFastMathFlags(), @@ -3537,7 +3653,8 @@ public: InductionOpcode(Opcode) {} 
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, - VPValue *Step, VPValue *VF, DebugLoc DL = {}) + VPValue *Step, VPValue *VF, + DebugLoc DL = DebugLoc::getUnknown()) : VPScalarIVStepsRecipe( IV, Step, VF, IndDesc.getInductionOpcode(), dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()) @@ -4142,7 +4259,7 @@ public: /// Returns an iterator range over all VFs of the plan. iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() const { - return {VFs.begin(), VFs.end()}; + return VFs; } bool hasScalarVFOnly() const { @@ -4299,9 +4416,8 @@ public: /// via the other early exit). bool hasEarlyExit() const { return count_if(ExitBlocks, - [](VPIRBasicBlock *EB) { - return EB->getNumPredecessors() != 0; - }) > 1 || + [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) > + 1 || (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } @@ -4309,7 +4425,7 @@ public: /// that this relies on unneeded branches to the scalar tail loop being /// removed. bool hasScalarTail() const { - return !(getScalarPreheader()->getNumPredecessors() == 0 || + return !(!getScalarPreheader()->hasPredecessors() || getScalarPreheader()->getSinglePredecessor() == getEntry()); } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 747c6623aa22..d400ceff7797 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) - .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) { + .Case<VPInterleaveBase>([V](const auto *R) { // TODO: Use info from interleave group. 
return V->getUnderlyingValue()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 80b48de57b40..cef91c15dd87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } if (auto *SI = dyn_cast<SwitchInst>(Inst)) { + // Don't emit recipes for unconditional switch instructions. + if (SI->getNumCases() == 0) + continue; SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); @@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, } void VPlanTransforms::handleEarlyExits(VPlan &Plan, - bool HasUncountableEarlyExit, - VFRange &Range) { + bool HasUncountableEarlyExit) { auto *MiddleVPBB = cast<VPBasicBlock>( Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); @@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan, assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - cast<VPBasicBlock>(HeaderVPB), LatchVPBB, - Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, } } +void VPlanTransforms::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) { + // Generate code to check if the loop's trip count 
is less than VF * UF, or + // equal to it in case a scalar epilogue is required; this implies that the + // vector trip count is zero. This check also covers the case where adding one + // to the backedge-taken count overflowed leading to an incorrect trip count + // of zero. In this case we will also jump to the scalar loop. + CmpInst::Predicate CmpPred = + RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + // If tail is to be folded, vector loop takes care of all iterations. + VPValue *TripCountVPV = Plan.getTripCount(); + const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE); + Type *TripCountTy = TripCount->getType(); + auto GetMinTripCount = [&]() -> const SCEV * { + // Compute max(MinProfitableTripCount, UF * VF) and return it. + const SCEV *VFxUF = + SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW); + if (UF * VF.getKnownMinValue() >= + MinProfitableTripCount.getKnownMinValue()) { + // TODO: SCEV should be able to simplify test. + return VFxUF; + } + const SCEV *MinProfitableTripCountSCEV = + SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW); + return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF); + }; + + VPBasicBlock *EntryVPBB = Plan.getEntry(); + VPBuilder Builder(EntryVPBB); + VPValue *TripCountCheck = Plan.getFalse(); + const SCEV *Step = GetMinTripCount(); + if (TailFolded) { + if (CheckNeededWithTailFolding) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. 
+ VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( + TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *DistanceToMax = Builder.createNaryOp( + Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, + DebugLoc::getUnknown()); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF, + // minProfitableTripCount). + TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax, + Builder.createExpandSCEV(Step), DL); + } else { + // TripCountCheck = false, folding tail implies positive vector trip + // count. + } + } else { + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. + TripCount = SE.applyLoopGuards(TripCount, OrigLoop); + // Check if the trip count is < the step. + if (SE.isKnownPredicate(CmpPred, TripCount, Step)) { + // TODO: Ensure step is at most the trip count when determining max VF and + // UF, w/o tail folding. + TripCountCheck = Plan.getTrue(); + } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred), + TripCount, Step)) { + // Generate the minimum iteration check only if we cannot prove the + // check is known to be true, or known to be false. + VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step); + TripCountCheck = Builder.createICmp( + CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check"); + } // else step known to be < trip count, use TripCountCheck preset to false. 
+ } + VPInstruction *Term = + Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL); + if (MinItersBypassWeights) { + MDBuilder MDB(Plan.getContext()); + MDNode *BranchWeights = MDB.createBranchWeights( + ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); + Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + } +} + bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 1ec6ae677374..109156c1469c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() { return int_pred_ty<is_all_ones>(); } +struct is_zero_int { + bool isValue(const APInt &C) const { return C.isZero(); } +}; + +/// Match an integer 0 or a vector with all elements equal to 0. +/// For vectors, this includes constants with undefined elements. 
+inline int_pred_ty<is_zero_int> m_ZeroInt() { + return int_pred_ty<is_zero_int>(); +} + /// Matching combinators template <typename LTy, typename RTy> struct match_combine_or { LTy L; @@ -218,9 +228,12 @@ struct Recipe_match { if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...)) return false; - assert(R->getNumOperands() == std::tuple_size<Ops_t>::value && - "recipe with matched opcode does not have the expected number of " - "operands"); + if (R->getNumOperands() != std::tuple_size<Ops_t>::value) { + assert(Opcode == Instruction::PHI && + "non-variadic recipe with matched opcode does not have the " + "expected number of operands"); + return false; + } auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>(); if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) { @@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) { } template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t> +m_EVL(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0); +} + +template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t> m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } -template <typename Op0_t, typename Op1_t> -inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t> -m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { - return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); + +template <typename Op0_t, typename Op1_t, typename Op2_t> +inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> +m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2); } template <typename Op0_t, typename Op1_t> @@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) { return m_CombineOr(m_ZExt(Op0), m_SExt(Op0)); } +template <typename Op0_t> +inline 
match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t> +m_ZExtOrSelf(const Op0_t &Op0) { + return m_CombineOr(m_ZExt(Op0), Op0); +} + template <unsigned Opcode, typename Op0_t, typename Op1_t> inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0, const Op1_t &Op1) { @@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } +/// Match a binary AND operation. +template <typename Op0_t, typename Op1_t> +inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t> +m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) { + return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1); +} + /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index cdadc33e3088..0c27d535b680 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -14,11 +14,13 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; +using namespace VPlanPatternMatch; namespace { class VPPredicator { @@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { "Distinct incoming values with one having a full mask"); break; } + OperandsWithMask.push_back(EdgeMask); } PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c4fdcccc6d62..bf5148954309 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ 
b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const { return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); + case VPInterleaveEVLSC: case VPInterleaveSC: - return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; + return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; @@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return false; } default: + // FIXME: Return false if the recipe represents an interleaved store. return true; } } @@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { Instruction *UI = nullptr; if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) + else if (auto *IG = dyn_cast<VPInterleaveBase>(this)) UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) UI = &WidenMem->getIngredient(); @@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPIRFlags::intersectFlags(const VPIRFlags &Other) { + assert(OpType == Other.OpType && "OpType must match"); + switch (OpType) { + case OperationType::OverflowingBinOp: + WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; + WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; + break; + case OperationType::Trunc: + TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; + TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; + break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; + break; + 
case OperationType::PossiblyExactOp: + ExactFlags.IsExact &= Other.ExactFlags.IsExact; + break; + case OperationType::GEPOp: + GEPFlags &= Other.GEPFlags; + break; + case OperationType::FPMathOp: + FMFs.NoNaNs &= Other.FMFs.NoNaNs; + FMFs.NoInfs &= Other.FMFs.NoInfs; + break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; + break; + case OperationType::Cmp: + assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); + break; + case OperationType::Other: + assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); + break; + } +} + FastMathFlags VPIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::ICmp: case Instruction::FCmp: case Instruction::Store: - case VPInstruction::ActiveLaneMask: case VPInstruction::BranchOnCount: case VPInstruction::ComputeReductionResult: case VPInstruction::FirstOrderRecurrenceSplice: @@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::WideIVStep: return 2; case Instruction::Select: + case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: return 3; @@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Name); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); + auto PredTy = VectorType::get( + Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) + ->getZExtValue()); return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); @@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::AnyOf: { - Value *Res = 
State.get(getOperand(0)); + Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, State.get(Op)); + Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::ExtractLane: { @@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) { unsigned LastOpIdx = getNumOperands() - 1; Value *Res = nullptr; for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); + Value *TrailingZeros = + State.VF.isScalar() + ? Builder.CreateZExt( + Builder.CreateICmpEQ(State.get(getOperand(Idx)), + Builder.getFalse()), + Builder.getInt64Ty()) + : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), + State.get(getOperand(Idx)), + true, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { @@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { + case Instruction::Select: { + // TODO: It may be possible to improve this by analyzing where the + // condition operand comes from. + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); + if (!vputils::onlyFirstLaneUsed(this)) { + CondTy = toVectorTy(CondTy, VF); + VecTy = toVectorTy(VecTy, VF); + } + return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, + Ctx.CostKind); + } case Instruction::ExtractElement: case VPInstruction::ExtractLane: { + if (VF.isScalar()) { + // ExtractLane with VF=1 takes care of handling extracting across multiple + // parts. + return 0; + } + // Add on the cost of extracting the element. 
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); } case VPInstruction::FirstActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + auto *PredTy = toVectorTy(ScalarTy, VF); IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx), {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); @@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } case VPInstruction::ActiveLaneMask: { Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + unsigned Multiplier = + cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); + Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, {ArgTy, ArgTy}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); @@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.set(this, V); } -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. +static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, + ArrayRef<const VPValue *> Operands, + const VPRecipeWithIRFlags &R, + ElementCount VF, + VPCostContext &Ctx) { // Some backends analyze intrinsic arguments to determine cost. 
Use the // underlying value for the operand if it has one. Otherwise try to use the // operand of the underlying call instruction, if there is one. Otherwise // clear Arguments. // TODO: Rework TTI interface to be independent of concrete IR values. SmallVector<const Value *> Arguments; - for (const auto &[Idx, Op] : enumerate(operands())) { + for (const auto &[Idx, Op] : enumerate(Operands)) { auto *V = Op->getUnderlyingValue(); if (!V) { - if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { + if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) { Arguments.push_back(UI->getArgOperand(Idx)); continue; } @@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); + Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); + Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; SmallVector<Type *> ParamTys; - for (unsigned I = 0; I != getNumOperands(); ++I) - ParamTys.push_back( - toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + for (const VPValue *Op : Operands) { + ParamTys.push_back(VF.isVector() + ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) + : Ctx.Types.inferScalarType(Op)); + } // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); + FastMathFlags FMF = + R.hasFastMathFlags() ? 
R.getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( - VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), + ID, RetTy, Arguments, ParamTys, FMF, + dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()), InstructionCost::getInvalid(), &Ctx.TLI); return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } +InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + SmallVector<const VPValue *> ArgOps(operands()); + return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); +} + StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } @@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // More complex computation, let the legacy cost-model handle this for now. - return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); + // If the div/rem operation isn't safe to speculate and requires + // predication, then the only way we can even create a vplan is to insert + // a select on the second input operand to ensure we use the value of 1 + // for the inactive lanes. The select will be costed separately. case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { if (VF.isScalar()) return TTI::CastContextHint::Normal; - if (isa<VPInterleaveRecipe>(R)) + if (isa<VPInterleaveBase>(R)) return TTI::CastContextHint::Interleave; if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) return ReplicateRecipe->isPredicated() ? 
TTI::CastContextHint::Masked @@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); assert(RedTy->isIntegerTy() && "VPExpressionRecipe only supports integer types currently."); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - unsigned Opcode = RecurrenceDescriptor::getOpcode( - cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); return Ctx.TTI.getExtendedReductionCost( Opcode, cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == @@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); + return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, + Ctx.CostKind); case ExpressionTypes::ExtMulAccReduction: return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, - RedTy, SrcVecTy, Ctx.CostKind); + Opcode, RedTy, SrcVecTy, Ctx.CostKind); } llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); } @@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // instruction cost. return 0; case Instruction::Call: { - if (!isSingleScalar()) { - // TODO: Handle remaining call costs here as well. 
- if (VF.isScalable()) - return InstructionCost::getInvalid(); - break; - } - auto *CalledFn = cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - if (CalledFn->isIntrinsic()) - break; + SmallVector<const VPValue *> ArgOps(drop_end(operands())); SmallVector<Type *, 4> Tys; - for (VPValue *ArgOp : drop_end(operands())) + for (const VPValue *ArgOp : ArgOps) Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + + if (CalledFn->isIntrinsic()) + // Various pseudo-intrinsics with costs of 0 are scalarized instead of + // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. + switch (CalledFn->getIntrinsicID()) { + case Intrinsic::assume: + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: { + assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx) == 0 && + "scalarizing intrinsic should be free"); + return InstructionCost(0); + } + default: + break; + } + Type *ResultTy = Ctx.Types.inferScalarType(this); - return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + InstructionCost ScalarCallCost = + Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + if (isSingleScalar()) { + if (CalledFn->isIntrinsic()) + ScalarCallCost = std::min( + ScalarCallCost, + getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx)); + return ScalarCallCost; + } + + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + // Compute the cost of scalarizing the result and operands if needed. 
+ InstructionCost ScalarizationCost = 0; + if (VF.isVector()) { + if (!ResultTy->isVoidTy()) { + for (Type *VectorTy : + to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { + ScalarizationCost += Ctx.TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, Ctx.CostKind); + } + } + // Skip operands that do not require extraction/scalarization and do not + // incur any overhead. + SmallPtrSet<const VPValue *, 4> UniqueOperands; + Tys.clear(); + for (auto *Op : ArgOps) { + if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + !UniqueOperands.insert(Op).second) + continue; + Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF)); + } + ScalarizationCost += + Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind); + } + + return ScalarCallCost * VF.getFixedValue() + ScalarizationCost; } case Instruction::Add: case Instruction::Sub: @@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx) * (isSingleScalar() ? 1 : VF.getFixedValue()); + case Instruction::Load: + case Instruction::Store: { + if (isSingleScalar()) { + bool IsLoad = UI->getOpcode() == Instruction::Load; + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 
0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); + return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); + } + // TODO: See getMemInstScalarizationCost for how to handle replicating and + // predicated cases. + break; } } @@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // TODO: Using the original IR may not be accurate. // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = Ptr->getType(); + + // If the address value is uniform across all lanes, then the address can be + // calculated with scalar type and broadcast. 
+ if (!vputils::isSingleScalar(getAddr())) + PtrTy = toVectorTy(PtrTy, VF); + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, @@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); - assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + assert((!needsMaskForGaps() || !State.VF.isScalable()) && "Masking gaps for scalable vectors is not yet supported."); - const InterleaveGroup<Instruction> *Group = IG; + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); Instruction *Instr = Group->getInsertPos(); // Prepare for the vector type of the interleaved load/store. @@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { Value *MaskForGaps = nullptr; - if (NeedsMaskForGaps) { + if (needsMaskForGaps()) { MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); @@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved store group. Value *MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); - assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && + assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. 
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; @@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!needsMaskForGaps() && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. 
+ Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *InterleaveEVL = State.Builder.CreateMul( + EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl", + /* NUW= */ true, /* NSW= */ true); + LLVMContext &Ctx = State.Builder.getContext(); + + Value *GroupMask = nullptr; + if (VPValue *BlockInMask = getMask()) { + SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask)); + GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask"); + } else { + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + } + + // Vectorize the interleaved load group. + if (isa<LoadInst>(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic( + VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr, + "wide.vp.load"); + NewLoad->addParamAttr(0, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewLoad); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewLoad); + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. 
+ if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(getVPValue(J), StridedVec); + ++J; + } + return; + } // End for interleaved load. + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + // Vectorize the interleaved store group. + ArrayRef<VPValue *> StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector<Value *, 4> StoredVecs; + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) { + StoredVecs.push_back(PoisonValue::get(SubVT)); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + ++StoredIdx; + } + + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = + State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, InterleaveEVL}); + NewStore->addParamAttr(1, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewStore); + // TODO: Also manage existing metadata using VPIRMetadata. 
+ Group->addMetadata(NewStore); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. 
unsigned InsertPosIdx = 0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e0bf241c73fd..2cac5557daee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -39,6 +40,10 @@ using namespace llvm; using namespace VPlanPatternMatch; +cl::opt<bool> EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide get active lane mask instructions")); + bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : Recipe.operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(VPBB, Def)); + WorkList.insert({VPBB, Def}); } } @@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : SinkCandidate->operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(SinkTo, Def)); + WorkList.insert({SinkTo, Def}); Changed = true; } return Changed; @@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *BlockInMask = PredRecipe->getMask(); auto *MaskDef = BlockInMask->getDefiningRecipe(); auto *BOMRecipe = new VPBranchOnMaskRecipe( - BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc()); + BlockInMask, MaskDef ? 
MaskDef->getDebugLoc() : DebugLoc::getUnknown()); auto *Entry = Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); @@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser( Type *StepTy = TypeInfo.inferScalarType(Step); auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); return B.createPtrAdd(EndValue, - B.createNaryOp(Instruction::Sub, {Zero, Step}), {}, - "ind.escape"); + B.createNaryOp(Instruction::Sub, {Zero, Step}), + DebugLoc::getUnknown(), "ind.escape"); } if (ScalarTy->isFloatingPointTy()) { const auto &ID = WideIV->getInductionDescriptor(); @@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { if (!ExpR) continue; - auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR}); - if (I.second) + const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR); + if (Inserted) continue; - ExpR->replaceAllUsesWith(I.first->second); + ExpR->replaceAllUsesWith(V->second); ExpR->eraseFromParent(); } } @@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X // && (Y || Z) and (X || !X) into true. This requires queuing newly created // recipes to be visited during simplification. - VPValue *X, *Y; + VPValue *X, *Y, *Z; if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) { @@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - // OR x, 1 -> 1. - if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) { - Def->replaceAllUsesWith(Def->getOperand(0) == X ? 
Def->getOperand(1) - : Def->getOperand(0)); - Def->eraseFromParent(); - return; - } + // x | 1 -> 1 + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x | 0 -> x + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(X); + + // x & 0 -> 0 + if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x && false -> false + if (match(Def, m_LogicalAnd(m_VPValue(X), m_False()))) + return Def->replaceAllUsesWith(Def->getOperand(1)); + + // (x && y) || (x && z) -> x && (y || z) + VPBuilder Builder(Def); + if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) && + // Simplify only if one of the operands has one use to avoid creating an + // extra recipe. + (!Def->getOperand(0)->hasMoreThanOneUniqueUser() || + !Def->getOperand(1)->hasMoreThanOneUniqueUser())) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); + + // x && !x -> 0 + if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + return Def->replaceAllUsesWith(Plan->getOrAddLiveIn( + ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def)))); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) return Def->replaceAllUsesWith(X); @@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With + // tail folding it is likely that x is a header mask and can be simplified + // further. 
+ if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_VPValue(Z))) && + X->hasMoreThanOneUniqueUser()) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z))); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); @@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { m_VPValue(X), m_SpecificInt(1)))) { Type *WideStepTy = TypeInfo.inferScalarType(Def); if (TypeInfo.inferScalarType(X) != WideStepTy) - X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy); + X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy); Def->replaceAllUsesWith(X); return; } @@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - VPInstruction *OpVPI; - if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) && - OpVPI->isVectorToScalar()) { - Def->replaceAllUsesWith(OpVPI); - return; + if (match(Def, + m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) && + vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) { + return U->usesScalars(A) || Def == U; + })) { + return Def->replaceAllUsesWith(A); } } @@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R); + if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + auto *Clone = new VPReplicateRecipe( + RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), + true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); + Clone->insertBefore(RepOrWidenR); + auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement, + {Clone->getOperand(0)}); + Ext->insertBefore(Clone); + Clone->setOperand(0, Ext); + RepR->eraseFromParent(); + continue; + } + // Skip recipes that aren't single scalars or don't have only their // scalar results used. 
In the latter case, we would introduce extra // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - !vputils::onlyScalarValuesUsed(RepOrWidenR)) + !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { + return U->usesScalars(RepOrWidenR) || + match(cast<VPRecipeBase>(U), + m_ExtractLastElement(m_VPValue())); + })) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), @@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { } } +/// Try to see if all of \p Blend's masks share a common value logically and'ed +/// and remove it from the masks. +static void removeCommonBlendMask(VPBlendRecipe *Blend) { + if (Blend->isNormalized()) + return; + VPValue *CommonEdgeMask; + if (!match(Blend->getMask(0), + m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + if (!match(Blend->getMask(I), + m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1)); +} + /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes /// to make sure the masks are simplified. static void simplifyBlends(VPlan &Plan) { @@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) { if (!Blend) continue; + removeCommonBlendMask(Blend); + // Try to remove redundant blend recipes. SmallPtrSet<VPValue *, 4> UniqueValues; if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) @@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C); } +/// Try to replace multiple active lane masks used for control flow with +/// a single, wide active lane mask instruction followed by multiple +/// extract subvector intrinsics. 
This applies to the active lane mask +/// instructions both in the loop and in the preheader. +/// Incoming values of all ActiveLaneMaskPHIs are updated to use the +/// new extracts from the first active lane mask, which has it's last +/// operand (multiplier) set to UF. +static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, + unsigned UF) { + if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1) + return false; + + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); + auto *Term = &ExitingVPBB->back(); + + using namespace llvm::VPlanPatternMatch; + if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) + return false; + + auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); + LLVMContext &Ctx = Plan.getContext(); + + auto ExtractFromALM = [&](VPInstruction *ALM, + SmallVectorImpl<VPValue *> &Extracts) { + DebugLoc DL = ALM->getDebugLoc(); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<VPValue *> Ops; + Ops.append({ALM, Plan.getOrAddLiveIn( + ConstantInt::get(IntegerType::getInt64Ty(Ctx), + VF.getKnownMinValue() * Part))}); + auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), DL); + Extracts[Part] = Ext; + Ext->insertAfter(ALM); + } + }; + + // Create a list of each active lane mask phi, ordered by unroll part. 
+ SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr); + for (VPRecipeBase &R : Header->phis()) { + auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R); + if (!Phi) + continue; + VPValue *Index = nullptr; + match(Phi->getBackedgeValue(), + m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue())); + assert(Index && "Expected index from ActiveLaneMask instruction"); + + auto *II = dyn_cast<VPInstruction>(Index); + if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) { + auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue()); + Phis[Part->getZExtValue()] = Phi; + } else + // Anything other than a CanonicalIVIncrementForPart is part 0 + Phis[0] = Phi; + } + + assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && + "Expected one VPActiveLaneMaskPHIRecipe for each unroll part"); + + auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue()); + auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue()); + + assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask && + LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) && + "Expected incoming values of Phi to be ActiveLaneMasks"); + + // When using wide lane masks, the return type of the get.active.lane.mask + // intrinsic is VF x UF (last operand). + VPValue *ALMMultiplier = + Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + EntryALM->setOperand(2, ALMMultiplier); + LoopALM->setOperand(2, ALMMultiplier); + + // Create UF x extract vectors and insert into preheader. + SmallVector<VPValue *> EntryExtracts(UF); + ExtractFromALM(EntryALM, EntryExtracts); + + // Create UF x extract vectors and insert before the loop compare & branch, + // updating the compare to use the first extract. 
+ SmallVector<VPValue *> LoopExtracts(UF); + ExtractFromALM(LoopALM, LoopExtracts); + VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0)); + Not->setOperand(0, LoopExtracts[0]); + + // Update the incoming values of active lane mask phis. + for (unsigned Part = 0; Part < UF; ++Part) { + Phis[Part]->setStartValue(EntryExtracts[Part]); + Phis[Part]->setBackedgeValue(LoopExtracts[Part]); + } + + return true; +} + /// Try to simplify the branch condition of \p Plan. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, @@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || - match(Term, m_BranchOnCond( - m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { + match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch // terminator is BranchOnCount or BranchOnCond where the input is // Not(ActiveLaneMask). 
@@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - bool MadeChange = - simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); + bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); + MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); if (MadeChange) { @@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } +namespace { +struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { + static bool isSentinel(const VPSingleDefRecipe *Def) { + return Def == getEmptyKey() || Def == getTombstoneKey(); + } + + /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. + /// Returns an optional pair, where the first element indicates whether it is + /// an intrinsic ID. + static std::optional<std::pair<bool, unsigned>> + getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { + return TypeSwitch<const VPSingleDefRecipe *, + std::optional<std::pair<bool, unsigned>>>(R) + .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, + VPWidenSelectRecipe, VPReplicateRecipe>( + [](auto *I) { return std::make_pair(false, I->getOpcode()); }) + .Case<VPWidenIntrinsicRecipe>([](auto *I) { + return std::make_pair(true, I->getVectorIntrinsicID()); + }) + .Default([](auto *) { return std::nullopt; }); + } + + /// Returns true if recipe \p Def can be safely handed for CSE. + static bool canHandle(const VPSingleDefRecipe *Def) { + // We can extend the list of handled recipes in the future, + // provided we account for the data embedded in them while checking for + // equality or hashing. 
+ auto C = getOpcodeOrIntrinsicID(Def); + + // The issue with (Insert|Extract)Value is that the index of the + // insert/extract is not a proper operand in LLVM IR, and hence also not in + // VPlan. + if (!C || (!C->first && (C->second == Instruction::InsertValue || + C->second == Instruction::ExtractValue))) + return false; + + // During CSE, we can only handle recipes that don't read from memory: if + // they read from memory, there could be an intervening write to memory + // before the next instance is CSE'd, leading to an incorrect result. + return !Def->mayReadFromMemory(); + } + + /// Hash the underlying data of \p Def. + static unsigned getHashValue(const VPSingleDefRecipe *Def) { + const VPlan *Plan = Def->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + hash_code Result = hash_combine( + Def->getVPDefID(), getOpcodeOrIntrinsicID(Def), + TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def), + hash_combine_range(Def->operands())); + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def)) + if (RFlags->hasPredicate()) + return hash_combine(Result, RFlags->getPredicate()); + return Result; + } + + /// Check equality of underlying data of \p L and \p R. 
+ static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) { + if (isSentinel(L) || isSentinel(R)) + return L == R; + if (L->getVPDefID() != R->getVPDefID() || + getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) || + vputils::isSingleScalar(L) != vputils::isSingleScalar(R) || + !equal(L->operands(), R->operands())) + return false; + if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L)) + if (LFlags->hasPredicate() && + LFlags->getPredicate() != + cast<VPRecipeWithIRFlags>(R)->getPredicate()) + return false; + const VPlan *Plan = L->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R); + } +}; +} // end anonymous namespace + +/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p +/// Plan. +void VPlanTransforms::cse(VPlan &Plan) { + VPDominatorTree VPDT(Plan); + DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + auto *Def = dyn_cast<VPSingleDefRecipe>(&R); + if (!Def || !VPCSEDenseMapInfo::canHandle(Def)) + continue; + if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) { + // V must dominate Def for a valid replacement. + if (!VPDT.dominates(V->getParent(), VPBB)) + continue; + // Only keep flags present on both V and Def. + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V)) + RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def)); + Def->replaceAllUsesWith(V); + continue; + } + CSEMap[Def] = Def; + } + } +} + /// Move loop-invariant recipes out of the vector loop region in \p Plan. 
static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantInductionCasts, Plan); runPass(simplifyRecipes, Plan); - runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); - runPass(narrowToSingleScalarRecipes, Plan); + runPass(simplifyBlends, Plan); runPass(legalizeAndOptimizeInductions, Plan); + runPass(narrowToSingleScalarRecipes, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); runPass(simplifyRecipes, Plan); runPass(removeBranchOnConst, Plan); @@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - auto *EntryALM = - Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, - DL, "active.lane.mask.entry"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {EntryIncrement, TC, ALMMultiplier}, DL, + "active.lane.mask.entry"); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. 
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + auto *LaneMaskPhi = + new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown()); LaneMaskPhi->insertAfter(CanonicalIVPHI); // Create the active lane mask for the next iteration of the loop before the @@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); + {InLoopIncrement, TripCount, ALMMultiplier}, + DL, "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); // Replace the original terminator with BranchOnCond. We have to invert the @@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( /// for the header-mask pattern manually. static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { SmallVector<VPValue *> WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + IsaPred<VPWidenCanonicalIVRecipe>); assert(count_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= - 1 && + IsaPred<VPWidenCanonicalIVRecipe>) <= 1 && "Must have at most one VPWideCanonicalIVRecipe"); if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { auto *WideCanonicalIV = @@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + 
IsaPred<VPWidenCanonicalIVRecipe>); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); @@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + LaneMask = + B.createNaryOp(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, + nullptr, "active.lane.mask"); } // Walk users of WideCanonicalIV and replace the header mask of the form @@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewAddr = GetNewAddr(S->getAddr()); return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); }) + .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { + VPValue *NewMask = GetNewMask(IR->getMask()); + return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); + }) .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); @@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPBuilder Builder(LoopRegion->getPreheaderVPBB()); MaxEVL = Builder.createScalarZExtOrTrunc( MaxEVL, Type::getInt32Ty(Plan.getContext()), - TypeInfo.inferScalarType(MaxEVL), DebugLoc()); + TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); - VPValue *PrevEVL = - Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl"); + VPValue *PrevEVL = Builder.createScalarPhi( + {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { @@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); - assert(NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>( + EVLRecipe)) { + for (unsigned I = 0; I < NumDefVal; ++I) { + VPValue *CurVPV = CurRecipe->getVPValue(I); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); + } } ToErase.push_back(CurRecipe); } @@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. 
- auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); + auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown()); EVLPhi->insertAfter(CanonicalIVPHI); VPBuilder Builder(Header, Header->getFirstNonPhi()); // Create the AVL (application vector length), starting from TC -> 0 in steps @@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *AVLSafe = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); - AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); + AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), + "safe_avl"); } auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, - DebugLoc()); + DebugLoc::getUnknown()); auto *CanonicalIVIncrement = cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); @@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); + VPValue *AVL; + [[maybe_unused]] bool FoundAVL = + match(EVLIncrement, + m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))); + assert(FoundAVL && "Didn't find AVL?"); + + // The AVL may be capped to a safe distance. + VPValue *SafeAVL; + if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue()))) + AVL = SafeAVL; + + VPValue *AVLNext; + [[maybe_unused]] bool FoundAVLNext = + match(AVL, m_VPInstruction<Instruction::PHI>( + m_Specific(Plan.getTripCount()), m_VPValue(AVLNext))); + assert(FoundAVLNext && "Didn't find AVL backedge?"); // Convert EVLPhi to concrete recipe. auto *ScalarR = @@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { // Replace the use of VectorTripCount in the latch-exiting block. 
// Before: (branch-on-count EVLIVInc, VectorTripCount) - // After: (branch-on-count EVLIVInc, TripCount) + // After: (branch-on-cond eq AVLNext, 0) VPBasicBlock *LatchExiting = HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock(); @@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { m_BranchOnCount(m_VPValue(EVLIncrement), m_Specific(&Plan.getVectorTripCount()))) && "Unexpected terminator in EVL loop"); - LatchExitingBr->setOperand(1, Plan.getTripCount()); + + Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); + VPBuilder Builder(LatchExitingBr); + VPValue *Cmp = + Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); + LatchExitingBr->eraseFromParent(); +} + +void VPlanTransforms::replaceSymbolicStrides( + VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap) { + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { + auto *R = cast<VPRecipeBase>(&U); + return R->getParent()->getParent() || + R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); + }; + for (const SCEV *Stride : StridesMap.values()) { + using namespace SCEVPatternMatch; + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + const APInt *StrideConst; + if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst))) + // Only handle constant strides for now. + continue; + + auto *CI = + Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + + // The versioned value may not be used in the loop directly but through a + // sext/zext. Add new live-ins in those cases. 
+ for (Value *U : StrideV->users()) { + if (!isa<SExtInst, ZExtInst>(U)) + continue; + VPValue *StrideVPV = Plan.getLiveIn(U); + if (!StrideVPV) + continue; + unsigned BW = U->getType()->getScalarSizeInBits(); + APInt C = + isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW); + VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + } + } } void VPlanTransforms::dropPoisonGeneratingRecipes( @@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step); Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags); - Init = - Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction"); + Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, + DebugLoc::getUnknown(), "induction"); // Create the widened phi of the vector IV. auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, @@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { R->eraseFromParent(); } -void VPlanTransforms::handleUncountableEarlyExit( - VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, - VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) { +void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, + VPBasicBlock *EarlyExitVPBB, + VPlan &Plan, + VPBasicBlock *HeaderVPBB, + VPBasicBlock *LatchVPBB) { VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; if (!EarlyExitVPBB->getSinglePredecessor() && EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { @@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit( } VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx); - auto IsVector = [](ElementCount VF) { return VF.isVector(); }; - // When the VFs are vectors, need to add `extract` to get the incoming value - // from early exit. 
When the range contains scalar VF, limit the range to - // scalar VF to prevent mis-compilation for the range containing both scalar - // and vector VFs. - if (!IncomingFromEarlyExit->isLiveIn() && - LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) { + if (!IncomingFromEarlyExit->isLiveIn()) { // Update the incoming value from the early exit. VPValue *FirstActiveLane = EarlyExitB.createNaryOp( VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr, @@ -3125,7 +3469,7 @@ static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - if (Opcode != Instruction::Add) + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; Type *RedTy = Ctx.Types.inferScalarType(Red); @@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = - Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( + isZExt, Opcode, RedTy, SrcVecTy, CostKind); InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { Plan.resetTripCount(Exp); ExpSCEV->eraseFromParent(); } + assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) && + "VPExpandSCEVRecipes must be at the beginning of the entry block, " + "after any VPIRInstructions"); + // Add IR instructions in the entry basic block but not in the VPIRBasicBlock + // to the VPIRBasicBlock. 
+ auto EI = Entry->begin(); + for (Instruction &I : drop_end(*EntryBB)) { + if (EI != Entry->end() && isa<VPIRInstruction>(*EI) && + &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) { + EI++; + continue; + } + VPIRInstruction::create(I)->insertBefore(*Entry, EI); + } + return ExpandedSCEVs; } @@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) { void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); - if (VF.isScalable() || !VectorLoop) + if (!VectorLoop) return; VPTypeAnalysis TypeInfo(Plan); - unsigned FixedVF = VF.getFixedValue(); + unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { if (isa<VPCanonicalIVPHIRecipe>(&R) || @@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, continue; // Bail out on non-consecutive interleave groups. - if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo, + if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, VectorRegWidth)) return; @@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, return; // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. 
- auto NarrowOp = [](VPValue *V) -> VPValue * { + SmallPtrSet<VPValue *, 4> NarrowedOps; + auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { auto *R = V->getDefiningRecipe(); - if (!R) + if (!R || NarrowedOps.contains(V)) return V; if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { // Narrow interleave group to wide load, as transformed VPlan will only @@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); + NarrowedOps.insert(L); return L; } @@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, assert(RepR->isSingleScalar() && isa<LoadInst>(RepR->getUnderlyingInstr()) && "must be a single scalar load"); + NarrowedOps.insert(RepR); return RepR; } auto *WideLoad = cast<VPWidenLoadRecipe>(R); @@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, /*IsUniform*/ true, /*Mask*/ nullptr, *WideLoad); N->insertBefore(WideLoad); + NarrowedOps.insert(N); return N; }; @@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // original iteration. 
auto *CanIV = Plan.getCanonicalIV(); auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); - Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get( - CanIV->getScalarType(), 1 * Plan.getUF()))); - Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + VPBuilder PHBuilder(Plan.getVectorPreheader()); + + VPValue *UF = Plan.getOrAddLiveIn( + ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + if (VF.isScalable()) { + VPValue *VScale = PHBuilder.createElementCount( + CanIV->getScalarType(), ElementCount::getScalable(1)); + VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); + Inc->setOperand(1, VScaleUF); + Plan.getVF().replaceAllUsesWith(VScale); + } else { + Inc->setOperand(1, UF); + Plan.getVF().replaceAllUsesWith( + Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 700b94621d5f..1957428fab79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -62,16 +62,47 @@ struct VPlanTransforms { /// The created loop is wrapped in an initial skeleton to facilitate /// vectorization, consisting of a vector pre-header, an exit block for the /// main vector loop (middle.block) and a new block as preheader of the scalar - /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p - /// InductionTy and \p IVDL, and creates a VPValue expression for the original - /// trip count. + /// loop (scalar.ph). See below for an illustration. It also adds a canonical + /// IV and its increment, using \p InductionTy and \p IVDL, and creates a + /// VPValue expression for the original trip count. + /// + /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's + /// / \ old preheader. 
Will contain iteration number check and SCEV + /// | | expansions. + /// | | + /// / v + /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be + /// | / | added later. + /// | / v + /// || [ ] <-- vector pre header. + /// |/ | + /// | v + /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized. + /// | [ ]_| + /// | | + /// | v + /// | [ ] <--- middle-block with the branch to successors + /// | / | + /// | / | + /// | | v + /// \--->[ ] <--- scalar preheader (initial a VPBasicBlock, which will be + /// | | replaced later by a VPIRBasicBlock wrapping the scalar + /// | | preheader basic block. + /// | | + /// v <-- edge from middle to exit iff epilogue is not required. + /// | [ ] \ + /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, + /// | | header wrapped in VPIRBasicBlock). + /// \ | + /// \ v + /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE); /// Update \p Plan to account for all early exits. - LLVM_ABI_FOR_TEST static void - handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, + bool HasUncountableExit); /// If a check is needed to guard executing the scalar epilogue loop, it will /// be added to the middle block. @@ -79,6 +110,13 @@ struct VPlanTransforms { bool RequiresScalarEpilogueCheck, bool TailFolded); + // Create a check to \p Plan to see if the vector loop should be executed. 
+ static void addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE); + /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG. LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan); @@ -161,6 +199,12 @@ struct VPlanTransforms { truncateToMinimalBitwidths(VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs); + /// Replace symbolic strides from \p StridesMap in \p Plan with constants when + /// possible. + static void + replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap); + /// Drop poison flags from recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those /// recipes meet the following conditions: @@ -207,8 +251,7 @@ struct VPlanTransforms { static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, - VPBasicBlock *LatchVPBB, - VFRange &Range); + VPBasicBlock *LatchVPBB); /// Replace loop regions with explicit CFG. static void dissolveLoopRegions(VPlan &Plan); @@ -220,9 +263,10 @@ struct VPlanTransforms { /// variable vector lengths instead of fixed lengths. This transformation: /// * Makes EVL-Phi concrete. // * Removes CanonicalIV and increment. - /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc, - /// VectorTripCount) with variable-length stepping (branch-on-cond - /// EVLIVInc, TripCount). 
+ /// * Replaces the exit condition from + /// (branch-on-count CanonicalIVInc, VectorTripCount) + /// to + /// (branch-on-cond eq AVLNext, 0) static void canonicalizeEVLLoops(VPlan &Plan); /// Lower abstract recipes to concrete ones, that can be codegen'd. @@ -242,6 +286,9 @@ struct VPlanTransforms { /// removing dead edges to their successors. static void removeBranchOnConst(VPlan &Plan); + /// Perform common-subexpression-elimination on \p Plan. + static void cse(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 4bcde8cd5d42..443df167378b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -92,18 +92,18 @@ public: void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) { for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { - auto Ins = VPV2Parts.insert({VPV, {}}); - assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); - Ins.first->second.push_back(CopyR->getVPValue(Idx)); + const auto &[V, _] = VPV2Parts.try_emplace(VPV); + assert(V->second.size() == Part - 1 && "earlier parts not set"); + V->second.push_back(CopyR->getVPValue(Idx)); } } /// Given a uniform recipe \p R, add it for all parts. 
void addUniformForAllParts(VPSingleDefRecipe *R) { - auto Ins = VPV2Parts.insert({R, {}}); - assert(Ins.second && "uniform value already added"); + const auto &[V, Inserted] = VPV2Parts.try_emplace(R); + assert(Inserted && "uniform value already added"); for (unsigned Part = 0; Part != UF; ++Part) - Ins.first->second.push_back(R); + V->second.push_back(R); } bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } @@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { VPBuilder Builder(RepR); if (RepR->getNumUsers() == 0) { - if (isa<StoreInst>(RepR->getUnderlyingInstr()) && - vputils::isSingleScalar(RepR->getOperand(1))) { - // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), - Def2LaneDefs); - } else { - // Create single-scalar version of RepR for all lanes. - for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); - } + // Create single-scalar version of RepR for all lanes. 
+ for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); RepR->eraseFromParent(); continue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 700a733bf9f2..c6c1ef336982 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9e1d325a4d8d..77c099b27171 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) { case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::FCmp: + case Instruction::Select: + case VPInstruction::Not: case VPInstruction::Broadcast: case VPInstruction::PtrAdd: return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef..85c6c2c8d796 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,7 +38,7 @@ struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPInterleaveRecipe; +class VPInterleaveBase; class VPPhiAccessors; // This is the base class of the VPlan Def/Use graph, used for modeling the data @@ -48,7 +48,7 @@ class VPPhiAccessors; class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; - friend class VPInterleaveRecipe; + friend class VPInterleaveBase; 
friend class VPlan; friend class VPExpressionRecipe; @@ -335,6 +335,7 @@ public: VPExpressionSC, VPIRInstructionSC, VPInstructionSC, + VPInterleaveEVLSC, VPInterleaveSC, VPReductionEVLSC, VPReductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index e25ffe135418..99f3bc367a54 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>( + .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe, + VPInterleaveEVLRecipe>( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case<VPInstructionWithType>( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) @@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) { const VPBlockBase *Exiting = Region->getExiting(); // Entry and Exiting shouldn't have any predecessor/successor, respectively. - if (Entry->getNumPredecessors() != 0) { + if (Entry->hasPredecessors()) { errs() << "region entry block has predecessors\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 092a3a87954f..17cb18a22336 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -99,6 +99,10 @@ private: InstructionWorklist Worklist; + /// Next instruction to iterate. It will be updated when it is erased by + /// RecursivelyDeleteTriviallyDeadInstructions. + Instruction *NextInst; + // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" // parameter. That should be updated to specific sub-classes because the // run loop was changed to dispatch on opcode. 
@@ -118,6 +122,7 @@ private: bool foldInsExtBinop(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); bool foldBitOpOfCastops(Instruction &I); + bool foldBitOpOfCastConstant(Instruction &I); bool foldBitcastShuffle(Instruction &I); bool scalarizeOpOrCmp(Instruction &I); bool scalarizeVPIntrinsic(Instruction &I); @@ -169,13 +174,16 @@ private: // further folds that were hindered by OneUse limits. SmallPtrSet<Value *, 4> Visited; for (Value *Op : Ops) { - if (Visited.insert(Op).second) { + if (!Visited.contains(Op)) { if (auto *OpI = dyn_cast<Instruction>(Op)) { if (RecursivelyDeleteTriviallyDeadInstructions( - OpI, nullptr, nullptr, [this](Value *V) { - if (auto I = dyn_cast<Instruction>(V)) { + OpI, nullptr, nullptr, [&](Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) { LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); Worklist.remove(I); + if (I == NextInst) + NextInst = NextInst->getNextNode(); + Visited.insert(I); } })) continue; @@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { if (LHSSrc->getType() != RHSSrc->getType()) return false; - // Only handle vector types with integer elements - auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType()); - auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType()); - if (!SrcVecTy || !DstVecTy) + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) return false; - if (!SrcVecTy->getScalarType()->isIntegerTy() || - !DstVecTy->getScalarType()->isIntegerTy()) + // Only integer scalar/vector values are legal for bitwise logic operations. 
+ if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) return false; // Cost Check : @@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { // NewCost = bitlogic + cast // Calculate specific costs for each cast with instruction context - InstructionCost LHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, LHSCast); - InstructionCost RHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, RHSCast); + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + InstructionCost RHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast); InstructionCost OldCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) + LHSCastCost + RHSCastCost; // For new cost, we can't provide an instruction (it doesn't exist yet) InstructionCost GenericCastCost = TTI.getCastInstrCost( - CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind); + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); InstructionCost NewCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) + GenericCastCost; // Account for multi-use casts using specific costs @@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { return true; } +/// Match: +// bitop(castop(x), C) -> +// bitop(castop(x), castop(InvC)) -> +// castop(bitop(x, InvC)) +// Supports: bitcast +bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) { + Instruction *LHS; + Constant *C; + + // Check if this is a bitwise logic operation + if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C)))) + return false; + + // Get the cast 
instructions + auto *LHSCast = dyn_cast<CastInst>(LHS); + if (!LHSCast) + return false; + + Instruction::CastOps CastOpcode = LHSCast->getOpcode(); + + // Only handle supported cast operations + switch (CastOpcode) { + case Instruction::BitCast: + break; + default: + return false; + } + + Value *LHSSrc = LHSCast->getOperand(0); + + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) + return false; + + // Only integer scalar/vector values are legal for bitwise logic operations. + if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) + return false; + + // Find the constant InvC, such that castop(InvC) equals to C. + PreservedCastFlags RHSFlags; + Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags); + if (!InvC) + return false; + + // Cost Check : + // OldCost = bitlogic + cast + // NewCost = bitlogic + cast + + // Calculate specific costs for each cast with instruction context + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost; + + // For new cost, we can't provide an instruction (it doesn't exist yet) + InstructionCost GenericCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = + TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) + + GenericCastCost; + + // Account for multi-use casts using specific costs + if (!LHSCast->hasOneUse()) + NewCost += LHSCastCost; + + LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost + << " NewCost=" << NewCost << "\n"); + + if (NewCost > OldCost) 
+ return false; + + // Create the operation on the source type + Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), + LHSSrc, InvC, I.getName() + ".inner"); + if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp)) + NewBinOp->copyIRFlags(&I); + + Worklist.pushValue(NewOp); + + // Create the cast operation directly to ensure we get a new instruction + Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + + // Insert the new instruction + Value *Result = Builder.Insert(NewCast); + + replaceValue(I, *Result); + return true; +} + /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. @@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::CastContextHint::None, CostKind, RedOp); CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost; - CostAfterReduction = - TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind); + CostAfterReduction = TTI.getMulAccReductionCost( + IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind); return; } CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy, @@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { unsigned MaxVectorSize = TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); unsigned MaxElementsInVector = MaxVectorSize / ElementSize; + if (MaxElementsInVector == 0) + return false; // When there are multiple shufflevector operations on the same input, // especially when the vector length is larger than the register size, // identical shuffle patterns may occur across different groups of elements. 
@@ -4467,6 +4574,8 @@ bool VectorCombine::run() { case Instruction::Xor: if (foldBitOpOfCastops(I)) return true; + if (foldBitOpOfCastConstant(I)) + return true; break; case Instruction::PHI: if (shrinkPhiOfShuffles(I)) @@ -4519,13 +4628,21 @@ bool VectorCombine::run() { if (!DT.isReachableFromEntry(&BB)) continue; // Use early increment range so that we can erase instructions in loop. - for (Instruction &I : make_early_inc_range(BB)) { - if (I.isDebugOrPseudoInst()) - continue; - MadeChange |= FoldInst(I); + // make_early_inc_range is not applicable here, as the next iterator may + // be invalidated by RecursivelyDeleteTriviallyDeadInstructions. + // We manually maintain the next instruction and update it when it is about + // to be deleted. + Instruction *I = &BB.front(); + while (I) { + NextInst = I->getNextNode(); + if (!I->isDebugOrPseudoInst()) + MadeChange |= FoldInst(*I); + I = NextInst; } } + NextInst = nullptr; + while (!Worklist.isEmpty()) { Instruction *I = Worklist.removeOne(); if (!I) |
