diff options
| author | Michael Kruse <llvm-project@meinersbur.de> | 2025-01-03 10:22:51 +0100 |
|---|---|---|
| committer | Michael Kruse <llvm-project@meinersbur.de> | 2025-01-03 10:22:51 +0100 |
| commit | 38500d63e14ce340236840f60d356cdefb56a52c (patch) | |
| tree | 17edbec446ce9b50d2f215a483b83afb293a635d /llvm/lib/Transforms | |
| parent | 1a3d5daaef7a6a63448a497da3eff7fc9e23df26 (diff) | |
| parent | 27f30029741ecf023baece7b3dde1ff9011ffefc (diff) | |
Merge branch 'main' into users/meinersbur/flang_runtime_split-headers (ref: users/meinersbur/flang_runtime_split-headers)
Diffstat (limited to 'llvm/lib/Transforms')
79 files changed, 5080 insertions, 2975 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 45ee2d472a11..12ae6740e055 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain /// of 'and' ops, then we also need to capture the fact that we saw an /// "and X, 1", so that's an extra return value for that case. +namespace { struct MaskOps { Value *Root = nullptr; APInt Mask; @@ -190,6 +191,7 @@ struct MaskOps { MaskOps(unsigned BitWidth, bool MatchAnds) : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {} }; +} // namespace /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a /// chain of 'and' or 'or' instructions looking for shift ops of a common source diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 0baa34d50abf..20fc630a74a8 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -32,10 +32,9 @@ namespace { bool AlwaysInlineImpl( Module &M, bool InsertLifetime, ProfileSummaryInfo &PSI, + FunctionAnalysisManager *FAM, function_ref<AssumptionCache &(Function &)> GetAssumptionCache, - function_ref<AAResults &(Function &)> GetAAR, - function_ref<BlockFrequencyInfo &(Function &)> GetBFI, - function_ref<BlockFrequencyInfo *(Function &)> GetCachedBFI) { + function_ref<AAResults &(Function &)> GetAAR) { SmallSetVector<CallBase *, 16> Calls; bool Changed = false; SmallVector<Function *, 16> InlinedComdatFunctions; @@ -62,12 +61,7 @@ bool AlwaysInlineImpl( DebugLoc DLoc = CB->getDebugLoc(); BasicBlock *Block = CB->getParent(); - // Only update CallerBFI if already available. 
The CallerBFI update - // requires CalleeBFI. - BlockFrequencyInfo *CallerBFI = GetCachedBFI(*Caller); - InlineFunctionInfo IFI(GetAssumptionCache, &PSI, CallerBFI, - CallerBFI ? &GetBFI(F) : nullptr); - + InlineFunctionInfo IFI(GetAssumptionCache, &PSI, nullptr, nullptr); InlineResult Res = InlineFunction(*CB, IFI, /*MergeAttributes=*/true, &GetAAR(F), InsertLifetime); if (!Res.isSuccess()) { @@ -86,6 +80,8 @@ bool AlwaysInlineImpl( /*ForProfileContext=*/false, DEBUG_TYPE); Changed = true; + if (FAM) + FAM->invalidate(*Caller, PreservedAnalyses::none()); } F.removeDeadConstantUsers(); @@ -95,6 +91,8 @@ bool AlwaysInlineImpl( if (F.hasComdat()) { InlinedComdatFunctions.push_back(&F); } else { + if (FAM) + FAM->clear(F, F.getName()); M.getFunctionList().erase(F); Changed = true; } @@ -107,6 +105,8 @@ bool AlwaysInlineImpl( filterDeadComdatFunctions(InlinedComdatFunctions); // The remaining functions are actually dead. for (Function *F : InlinedComdatFunctions) { + if (FAM) + FAM->clear(*F, F->getName()); M.getFunctionList().erase(F); Changed = true; } @@ -136,12 +136,9 @@ struct AlwaysInlinerLegacyPass : public ModulePass { auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); }; - auto GetCachedBFI = [](Function &) -> BlockFrequencyInfo * { - return nullptr; - }; - return AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache, GetAAR, - /*GetBFI=*/nullptr, GetCachedBFI); + return AlwaysInlineImpl(M, InsertLifetime, PSI, /*FAM=*/nullptr, + GetAssumptionCache, GetAAR); } static char ID; // Pass identification, replacement for typeid @@ -175,19 +172,18 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return FAM.getResult<AssumptionAnalysis>(F); }; - auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { - return FAM.getResult<BlockFrequencyAnalysis>(F); - }; - auto GetCachedBFI = [&](Function &F) -> 
BlockFrequencyInfo * { - return FAM.getCachedResult<BlockFrequencyAnalysis>(F); - }; auto GetAAR = [&](Function &F) -> AAResults & { return FAM.getResult<AAManager>(F); }; auto &PSI = MAM.getResult<ProfileSummaryAnalysis>(M); - bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, GetAssumptionCache, - GetAAR, GetBFI, GetCachedBFI); + bool Changed = AlwaysInlineImpl(M, InsertLifetime, PSI, &FAM, + GetAssumptionCache, GetAAR); + if (!Changed) + return PreservedAnalyses::all(); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + // We have already invalidated all analyses on modified functions. + PA.preserveSet<AllAnalysesOn<Function>>(); + return PA; } diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index afb0ea72b269..fe9cca01a8f3 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -633,7 +633,7 @@ ArgumentAccessInfo getArgmentAccessInfo(const Instruction *I, [](Value *Length, std::optional<int64_t> Offset) -> std::optional<ConstantRange> { auto *ConstantLength = dyn_cast<ConstantInt>(Length); - if (ConstantLength && Offset) + if (ConstantLength && Offset && !ConstantLength->isNegative()) return ConstantRange( APInt(64, *Offset, true), APInt(64, *Offset + ConstantLength->getSExtValue(), true)); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index ea92c6e2f59e..1bf7ff468d78 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion( } // namespace llvm extern cl::opt<bool> MemProfReportHintedSizes; +extern cl::opt<unsigned> MinClonedColdBytePercent; namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. 
@@ -617,6 +618,11 @@ private: static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); } + /// Get the AllocationType assigned to the given allocation instruction clone. + AllocationType getAllocationCallType(const CallInfo &Call) const { + return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call); + } + /// Update non-allocation call to invoke (possibly cloned) function /// CalleeFunc. void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { @@ -711,7 +717,8 @@ private: /// Map from each contextID to the profiled full contexts and their total /// sizes (there may be more than one due to context trimming), - /// optionally populated when requested (via MemProfReportHintedSizes). + /// optionally populated when requested (via MemProfReportHintedSizes or + /// MinClonedColdBytePercent). DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos; /// Identifies the context node created for a stack id when adding the MIB @@ -773,6 +780,7 @@ private: uint64_t getLastStackId(Instruction *Call); std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + AllocationType getAllocationCallType(const CallInfo &Call) const; void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo @@ -852,6 +860,7 @@ private: uint64_t getLastStackId(IndexCall &Call); std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + AllocationType getAllocationCallType(const CallInfo &Call) const; void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo @@ -896,21 +905,6 @@ struct DenseMapInfo<IndexCall> namespace { -struct FieldSeparator { - bool Skip = true; - const char *Sep; - - FieldSeparator(const 
char *Sep = ", ") : Sep(Sep) {} -}; - -raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { - if (FS.Skip) { - FS.Skip = false; - return OS; - } - return OS << FS.Sep; -} - // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc // type we should actually use on the corresponding allocation. // If we can't clone a node that has NotCold+Cold alloc type, we will fall @@ -1216,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; - if (MemProfReportHintedSizes) { - assert(!ContextSizeInfo.empty()); + if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -2058,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> EmptyContext; unsigned I = 0; - assert(!MemProfReportHintedSizes || - AN.ContextSizeInfos.size() == AN.MIBs.size()); + assert( + (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) || + AN.ContextSizeInfos.size() == AN.MIBs.size()); // Now add all of the MIBs and their stack nodes. 
for (auto &MIB : AN.MIBs) { CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> StackContext(&MIB); std::vector<ContextTotalSize> ContextSizeInfo; - if (MemProfReportHintedSizes) { + if (!AN.ContextSizeInfos.empty()) { for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) ContextSizeInfo.push_back({FullStackId, TotalSize}); } @@ -2784,9 +2778,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( OS << "\t\t" << *Edge << "\n"; if (!Clones.empty()) { OS << "\tClones: "; - FieldSeparator FS; + ListSeparator LS; for (auto *Clone : Clones) - OS << FS << Clone; + OS << LS << Clone; OS << "\n"; } else if (CloneOf) { OS << "\tClone of " << CloneOf << "\n"; @@ -2840,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( if (!Node->IsAllocation) continue; DenseSet<uint32_t> ContextIds = Node->getContextIds(); + auto AllocTypeFromCall = getAllocationCallType(Node->Call); std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); std::sort(SortedIds.begin(), SortedIds.end()); for (auto Id : SortedIds) { @@ -2852,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( << getAllocTypeString((uint8_t)TypeI->second) << " full allocation context " << Info.FullStackId << " with total size " << Info.TotalSize << " is " - << getAllocTypeString(Node->AllocTypes) << " after cloning\n"; + << getAllocTypeString(Node->AllocTypes) << " after cloning"; + if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) + OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall) + << " due to cold byte percent"; + OS << "\n"; } } } @@ -3384,6 +3383,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) break; + // If the caller was not successfully matched to a call in the IR/summary, + // there is no point in trying to clone for it as we can't update that call. 
+ if (!CallerEdge->Caller->hasCall()) { + ++EI; + continue; + } + // Only need to process the ids along this edge pertaining to the given // allocation. auto CallerEdgeContextsForAlloc = @@ -3495,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; } +AllocationType +ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { + const auto *CB = cast<CallBase>(Call.call()); + if (!CB->getAttributes().hasFnAttr("memprof")) + return AllocationType::None; + return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" + ? AllocationType::Cold + : AllocationType::NotCold; +} + +AllocationType +IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { + const auto *AI = Call.call().dyn_cast<AllocInfo *>(); + assert(AI->Versions.size() > Call.cloneNo()); + return (AllocationType)AI->Versions[Call.cloneNo()]; +} + void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { if (CalleeFunc.cloneNo() > 0) @@ -4025,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { } } + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + auto UpdateCalls = [&](ContextNode *Node, DenseSet<const ContextNode *> &Visited, auto &&UpdateCalls) { @@ -4044,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { return; if (Node->IsAllocation) { - updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + auto AT = allocTypeToUse(Node->AllocTypes); + // If the allocation type is ambiguous, and more aggressive hinting + // has been enabled via the MinClonedColdBytePercent flag, see if this + // allocation should be hinted cold anyway because its fraction cold bytes + // allocated is at least the given threshold. 
+ if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && + !ContextIdToContextSizeInfos.empty()) { + uint64_t TotalCold = 0; + uint64_t Total = 0; + for (auto Id : Node->getContextIds()) { + auto TypeI = ContextIdToAllocationType.find(Id); + assert(TypeI != ContextIdToAllocationType.end()); + auto CSI = ContextIdToContextSizeInfos.find(Id); + if (CSI != ContextIdToContextSizeInfos.end()) { + for (auto &Info : CSI->second) { + Total += Info.TotalSize; + if (TypeI->second == AllocationType::Cold) + TotalCold += Info.TotalSize; + } + } + } + if (TotalCold * 100 >= Total * MinClonedColdBytePercent) + AT = AllocationType::Cold; + } + updateAllocationCall(Node->Call, AT); assert(Node->MatchingCalls.empty()); return; } @@ -4427,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // will still be none type or should have gotten the default NotCold. // Skip that after calling clone helper since that does some sanity // checks that confirm we haven't decided yet that we need cloning. - if (AllocNode.Versions.size() == 1) { + // We might have a single version that is cold due to the + // MinClonedColdBytePercent heuristic, make sure we don't skip in that + // case. + if (AllocNode.Versions.size() == 1 && + (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { assert((AllocationType)AllocNode.Versions[0] == AllocationType::NotCold || (AllocationType)AllocNode.Versions[0] == diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index b2fa66f2a6d3..603beb3b883d 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -529,7 +529,7 @@ protected: void generateMDProfMetadata(Function &F); bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles); - void removePseudoProbeInsts(Module &M); + void removePseudoProbeInstsDiscriminator(Module &M); /// Map from function name to Function *. 
Used to find the function from /// the function name. If the function name contains suffix, additional @@ -2138,13 +2138,25 @@ bool SampleProfileLoader::rejectHighStalenessProfile( return false; } -void SampleProfileLoader::removePseudoProbeInsts(Module &M) { +void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) { for (auto &F : M) { std::vector<Instruction *> InstsToDel; for (auto &BB : F) { for (auto &I : BB) { if (isa<PseudoProbeInst>(&I)) InstsToDel.push_back(&I); + else if (isa<CallBase>(&I)) + if (const DILocation *DIL = I.getDebugLoc().get()) { + // Restore dwarf discriminator for call. + unsigned Discriminator = DIL->getDiscriminator(); + if (DILocation::isPseudoProbeDiscriminator(Discriminator)) { + std::optional<uint32_t> DwarfDiscriminator = + PseudoProbeDwarfDiscriminator::extractDwarfBaseDiscriminator( + Discriminator); + I.setDebugLoc(DIL->cloneWithDiscriminator( + DwarfDiscriminator ? *DwarfDiscriminator : 0)); + } + } } } for (auto *I : InstsToDel) @@ -2224,8 +2236,12 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, notInlinedCallInfo) updateProfileCallee(pair.first, pair.second.entryCount); - if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased) - removePseudoProbeInsts(M); + if (RemoveProbeAfterProfileAnnotation && + FunctionSamples::ProfileIsProbeBased) { + removePseudoProbeInstsDiscriminator(M); + if (auto *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) + M.eraseNamedMetadata(FuncInfo); + } return retval; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index ea7942ef9781..7a184a19d7c5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1289,7 +1289,7 @@ static Instruction *foldAddToAshr(BinaryOperator &Add) { // Note that, by the time we end up here, if possible, ugt has been // canonicalized into eq. 
const APInt *MaskC, *MaskCCmp; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(Add.getOperand(1), m_SExt(m_ICmp(Pred, m_And(m_Specific(X), m_APInt(MaskC)), m_APInt(MaskCCmp))))) @@ -1382,7 +1382,7 @@ Instruction *InstCombinerImpl:: // `select` itself may be appropriately extended, look past that. SkipExtInMagic(Select); - ICmpInst::Predicate Pred; + CmpPredicate Pred; const APInt *Thr; Value *SignExtendingValue, *Zero; bool ShouldSignext; @@ -1654,7 +1654,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { return replaceInstUsesWith(I, Constant::getNullValue(I.getType())); // sext(A < B) + zext(A > B) => ucmp/scmp(A, B) - ICmpInst::Predicate LTPred, GTPred; + CmpPredicate LTPred, GTPred; if (match(&I, m_c_Add(m_SExt(m_c_ICmp(LTPred, m_Value(A), m_Value(B))), m_ZExt(m_c_ICmp(GTPred, m_Deferred(A), m_Deferred(B))))) && @@ -1841,7 +1841,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { // --> // BW - ctlz(A - 1, false) const APInt *XorC; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (match(&I, m_c_Add( m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)), @@ -2280,6 +2280,16 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes())))) return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X); + // if (C1 & C2) == C2 then (X & C1) - (X & C2) -> X & (C1 ^ C2) + Constant *C1, *C2; + if (match(Op0, m_And(m_Value(X), m_ImmConstant(C1))) && + match(Op1, m_And(m_Specific(X), m_ImmConstant(C2)))) { + Value *AndC = ConstantFoldBinaryInstruction(Instruction::And, C1, C2); + if (C2->isElementWiseEqual(AndC)) + return BinaryOperator::CreateAnd( + X, ConstantFoldBinaryInstruction(Instruction::Xor, C1, C2)); + } + // Reassociate sub/add sequences to create more add instructions and // reduce dependency chains: // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b4033fc2a418..e576eea4ca36 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -455,14 +455,20 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // RHS. For example, // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). - if (IsSuperSetOrEqual(BCst, DCst)) + if (IsSuperSetOrEqual(BCst, DCst)) { + // We can't guarantee that samesign hold after this fold. + RHS->setSameSign(false); return RHS; + } // Otherwise, B is a subset of D. If B and E have a common bit set, // ie. (B & E) != 0, then LHS is subsumed by RHS. For example. // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code"); - if ((*BCst & ECst) != 0) + if ((*BCst & ECst) != 0) { + // We can't guarantee that samesign hold after this fold. + RHS->setSameSign(false); return RHS; + } // Otherwise, LHS and RHS contradict and the whole expression becomes false // (or true if negated.) For example, // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false. 
@@ -695,13 +701,17 @@ Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, Cmp1->getPredicate()); Value *Input = Cmp0->getOperand(0); + Value *Cmp1Op0 = Cmp1->getOperand(0); + Value *Cmp1Op1 = Cmp1->getOperand(1); Value *RangeEnd; - if (Cmp1->getOperand(0) == Input) { + if (match(Cmp1Op0, m_SExtOrSelf(m_Specific(Input)))) { // For the upper range compare we have: icmp x, n - RangeEnd = Cmp1->getOperand(1); - } else if (Cmp1->getOperand(1) == Input) { + Input = Cmp1Op0; + RangeEnd = Cmp1Op1; + } else if (match(Cmp1Op1, m_SExtOrSelf(m_Specific(Input)))) { // For the upper range compare we have: icmp n, x - RangeEnd = Cmp1->getOperand(0); + Input = Cmp1Op1; + RangeEnd = Cmp1Op0; Pred1 = ICmpInst::getSwappedPredicate(Pred1); } else { return nullptr; @@ -734,7 +744,7 @@ static Value * foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder, ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, const SimplifyQuery &Q) { - CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + CmpPredicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; // Make sure we have right compares for our op. if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred) return nullptr; @@ -871,7 +881,7 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1, // Try to match/decompose into: icmp eq (X & Mask), 0 auto tryToDecompose = [](ICmpInst *ICmp, Value *&X, APInt &UnsetBitsMask) -> bool { - CmpInst::Predicate Pred = ICmp->getPredicate(); + CmpPredicate Pred = ICmp->getPredicate(); // Can it be decomposed into icmp eq (X & Mask), 0 ? 
auto Res = llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1), @@ -940,7 +950,7 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1, static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, InstCombiner::BuilderTy &Builder, InstCombinerImpl &IC) { - CmpInst::Predicate Pred0, Pred1; + CmpPredicate Pred0, Pred1; Value *X; if (!match(Cmp0, m_ICmp(Pred0, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)), m_SpecificInt(1))) || @@ -1113,12 +1123,12 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, const SimplifyQuery &Q, InstCombiner::BuilderTy &Builder) { Value *ZeroCmpOp; - ICmpInst::Predicate EqPred; + CmpPredicate EqPred; if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) || !ICmpInst::isEquality(EqPred)) return nullptr; - ICmpInst::Predicate UnsignedPred; + CmpPredicate UnsignedPred; Value *A, *B; if (match(UnsignedICmp, @@ -1277,7 +1287,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, const SimplifyQuery &Q) { // Match an equality compare with a non-poison constant as Cmp0. // Also, give up if the compare can be constant-folded to avoid looping. - ICmpInst::Predicate Pred0; + CmpPredicate Pred0; Value *X; Constant *C; if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) || @@ -1291,7 +1301,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, // common operand as operand 1 (Pred1 is swapped if the common operand was // operand 0). 
Value *Y; - ICmpInst::Predicate Pred1; + CmpPredicate Pred1; if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Specific(X)))) return nullptr; @@ -1322,7 +1332,7 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, Value *InstCombinerImpl::foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, ICmpInst *ICmp2, bool IsAnd) { - ICmpInst::Predicate Pred1, Pred2; + CmpPredicate Pred1, Pred2; Value *V1, *V2; const APInt *C1, *C2; if (!match(ICmp1, m_ICmp(Pred1, m_Value(V1), m_APInt(C1))) || @@ -1344,12 +1354,12 @@ Value *InstCombinerImpl::foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, return nullptr; ConstantRange CR1 = ConstantRange::makeExactICmpRegion( - IsAnd ? ICmpInst::getInversePredicate(Pred1) : Pred1, *C1); + IsAnd ? ICmpInst::getInverseCmpPredicate(Pred1) : Pred1, *C1); if (Offset1) CR1 = CR1.subtract(*Offset1); ConstantRange CR2 = ConstantRange::makeExactICmpRegion( - IsAnd ? ICmpInst::getInversePredicate(Pred2) : Pred2, *C2); + IsAnd ? ICmpInst::getInverseCmpPredicate(Pred2) : Pred2, *C2); if (Offset2) CR2 = CR2.subtract(*Offset2); @@ -3939,7 +3949,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) return V; - CmpInst::Predicate Pred; + CmpPredicate Pred; Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv; // Check if the OR weakens the overflow condition for umul.with.overflow by // treating any non-zero result as overflow. 
In that case, we overflow if both @@ -4604,7 +4614,7 @@ Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) { } // not (cmp A, B) = !cmp A, B - CmpInst::Predicate Pred; + CmpPredicate Pred; if (match(NotOp, m_Cmp(Pred, m_Value(), m_Value())) && (NotOp->hasOneUse() || InstCombiner::canFreelyInvertAllUsersOf(cast<Instruction>(NotOp), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 7221c987b982..0b9379965f42 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -934,6 +934,11 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { } } + if (DestWidth == 1 && + (Trunc.hasNoUnsignedWrap() || Trunc.hasNoSignedWrap()) && + isKnownNonZero(Src, SQ.getWithInstruction(&Trunc))) + return replaceInstUsesWith(Trunc, ConstantInt::getTrue(DestTy)); + bool Changed = false; if (!Trunc.hasNoSignedWrap() && ComputeMaxSignificantBits(Src, /*Depth=*/0, &Trunc) <= DestWidth) { @@ -1847,15 +1852,16 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Value *X; Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); if (Op && Op->hasOneUse()) { - // FIXME: The FMF should propagate from the fptrunc, not the source op. 
IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (isa<FPMathOperator>(Op)) - Builder.setFastMathFlags(Op->getFastMathFlags()); + FastMathFlags FMF = FPT.getFastMathFlags(); + if (auto *FPMO = dyn_cast<FPMathOperator>(Op)) + FMF &= FPMO->getFastMathFlags(); + Builder.setFastMathFlags(FMF); if (match(Op, m_FNeg(m_Value(X)))) { Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); - - return UnaryOperator::CreateFNegFMF(InnerTrunc, Op); + Value *Neg = Builder.CreateFNeg(InnerTrunc); + return replaceInstUsesWith(FPT, Neg); } // If we are truncating a select that has an extended operand, we can @@ -2106,10 +2112,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { Base->getType() == Ty) { Value *Offset = EmitGEPOffset(GEP); auto *NewOp = BinaryOperator::CreateAdd(Base, Offset); - if (GEP->hasNoUnsignedWrap() || - (GEP->hasNoUnsignedSignedWrap() && - isKnownNonNegative(Offset, SQ.getWithInstruction(&CI)))) - NewOp->setHasNoUnsignedWrap(true); + NewOp->setHasNoUnsignedWrap(GEP->hasNoUnsignedWrap()); return NewOp; } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 56391d320e8b..d6fdade25559 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -689,13 +689,32 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (!isa<GetElementPtrInst>(RHS)) RHS = RHS->stripPointerCasts(); + auto CanFold = [Cond](GEPNoWrapFlags NW) { + if (ICmpInst::isEquality(Cond)) + return true; + + // Unsigned predicates can be folded if the GEPs have *any* nowrap flags. + assert(ICmpInst::isUnsigned(Cond)); + return NW != GEPNoWrapFlags::none(); + }; + + auto NewICmp = [Cond](GEPNoWrapFlags NW, Value *Op1, Value *Op2) { + if (!NW.hasNoUnsignedWrap()) { + // Convert signed to unsigned comparison. 
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Op1, Op2); + } + + auto *I = new ICmpInst(Cond, Op1, Op2); + I->setSameSign(NW.hasNoUnsignedSignedWrap()); + return I; + }; + Value *PtrBase = GEPLHS->getOperand(0); - if (PtrBase == RHS && - (GEPLHS->hasNoUnsignedSignedWrap() || ICmpInst::isEquality(Cond))) { + if (PtrBase == RHS && CanFold(GEPLHS->getNoWrapFlags())) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). Value *Offset = EmitGEPOffset(GEPLHS); - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, - Constant::getNullValue(Offset->getType())); + return NewICmp(GEPLHS->getNoWrapFlags(), Offset, + Constant::getNullValue(Offset->getType())); } if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && @@ -813,19 +832,18 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return replaceInstUsesWith(I, // No comparison is needed here. ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond))); - else if (NumDifferences == 1 && NW.hasNoUnsignedSignedWrap()) { + else if (NumDifferences == 1 && CanFold(NW)) { Value *LHSV = GEPLHS->getOperand(DiffOperand); Value *RHSV = GEPRHS->getOperand(DiffOperand); - // Make sure we do a signed comparison here. - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV); + return NewICmp(NW, LHSV, RHSV); } } - if (NW.hasNoUnsignedSignedWrap() || CmpInst::isEquality(Cond)) { + if (CanFold(NW)) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true); Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true); - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); + return NewICmp(NW, L, R); } } @@ -1155,7 +1173,7 @@ Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { // This fold is only valid for equality predicates. 
if (!I.isEquality()) return nullptr; - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *X, *Y, *Zero; if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))), m_CombineAnd(m_Zero(), m_Value(Zero))))) @@ -1172,7 +1190,7 @@ Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { /// by one-less-than-bitwidth into a sign test on the original value. Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) { Instruction *Val; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) return nullptr; @@ -1386,7 +1404,7 @@ Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) { }; for (BranchInst *BI : DC.conditionsFor(X)) { - ICmpInst::Predicate DomPred; + CmpPredicate DomPred; const APInt *DomC; if (!match(BI->getCondition(), m_ICmp(DomPred, m_Specific(X), m_APInt(DomC)))) @@ -1499,7 +1517,7 @@ Instruction * InstCombinerImpl::foldICmpTruncWithTruncOrExt(ICmpInst &Cmp, const SimplifyQuery &Q) { Value *X, *Y; - ICmpInst::Predicate Pred; + CmpPredicate Pred; bool YIsSExt = false; // Try to match icmp (trunc X), (trunc Y) if (match(&Cmp, m_ICmp(Pred, m_Trunc(m_Value(X)), m_Trunc(m_Value(Y))))) { @@ -3231,7 +3249,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, // i32 Equal, // i32 (select i1 (a < b), i32 Less, i32 Greater) // where Equal, Less and Greater are placeholders for any three constants. 
- ICmpInst::Predicate PredA; + CmpPredicate PredA; if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) || !ICmpInst::isEquality(PredA)) return false; @@ -3242,7 +3260,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, std::swap(EqualVal, UnequalVal); if (!match(EqualVal, m_ConstantInt(Equal))) return false; - ICmpInst::Predicate PredB; + CmpPredicate PredB; Value *LHS2, *RHS2; if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)), m_ConstantInt(Less), m_ConstantInt(Greater)))) @@ -3604,7 +3622,8 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant( m_OneUse(m_c_Or(m_CombineAnd(m_Value(Sel), m_Select(m_Value(Cond), m_Value(TV), m_Value(FV))), - m_Value(Other))))) { + m_Value(Other)))) && + Cond->getType() == Cmp.getType()) { const SimplifyQuery Q = SQ.getWithInstruction(&Cmp); // Easy case is if eq/ne matches whether 0 is trueval/falseval. if (Pred == ICmpInst::ICMP_EQ @@ -4546,7 +4565,7 @@ static Value *foldICmpWithLowBitMaskedVal(CmpPredicate Pred, Value *Op0, static Value * foldICmpWithTruncSignExtendedVal(ICmpInst &I, InstCombiner::BuilderTy &Builder) { - ICmpInst::Predicate SrcPred; + CmpPredicate SrcPred; Value *X; const APInt *C0, *C1; // FIXME: non-splats, potentially with undef. // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use. @@ -4792,7 +4811,7 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, /// Note that the comparison is commutative, while inverted (u>=, ==) predicate /// will mean that we are looking for the opposite answer. 
Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *X, *Y; Instruction *Mul; Instruction *Div; @@ -4862,7 +4881,7 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { static Instruction *foldICmpXNegX(ICmpInst &I, InstCombiner::BuilderTy &Builder) { - CmpInst::Predicate Pred; + CmpPredicate Pred; Value *X; if (match(&I, m_c_ICmp(Pred, m_NSWNeg(m_Value(X)), m_Deferred(X)))) { @@ -5347,6 +5366,15 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, return new ICmpInst(Pred, X, Y); if (ZKnown.isNegative()) return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), X, Y); + Value *LessThan = simplifyICmpInst(ICmpInst::ICMP_SLT, X, Y, + SQ.getWithInstruction(&I)); + if (LessThan && match(LessThan, m_One())) + return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Z, + Constant::getNullValue(Z->getType())); + Value *GreaterThan = simplifyICmpInst(ICmpInst::ICMP_SGT, X, Y, + SQ.getWithInstruction(&I)); + if (GreaterThan && match(GreaterThan, m_One())) + return new ICmpInst(Pred, Z, Constant::getNullValue(Z->getType())); } } else { bool NonZero; @@ -6794,7 +6822,7 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) { /// then try to reduce patterns based on that limit. 
Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { Value *X, *Y; - ICmpInst::Predicate Pred; + CmpPredicate Pred; // X must be 0 and bool must be true for "ULT": // X <u (zext i1 Y) --> (X == 0) & Y @@ -6809,7 +6837,7 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { return BinaryOperator::CreateOr(Builder.CreateIsNull(X), Y); // icmp eq/ne X, (zext/sext (icmp eq/ne X, C)) - ICmpInst::Predicate Pred1, Pred2; + CmpPredicate Pred1, Pred2; const APInt *C; Instruction *ExtI; if (match(&I, m_c_ICmp(Pred1, m_Value(X), @@ -7079,7 +7107,7 @@ static Instruction *canonicalizeICmpBool(ICmpInst &I, // (X l>> Y) == 0 static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp, InstCombiner::BuilderTy &Builder) { - ICmpInst::Predicate Pred, NewPred; + CmpPredicate Pred, NewPred; Value *X, *Y; if (match(&Cmp, m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) { @@ -7244,7 +7272,7 @@ static Instruction *foldReductionIdiom(ICmpInst &I, const DataLayout &DL) { if (I.getType()->isVectorTy()) return nullptr; - ICmpInst::Predicate OuterPred, InnerPred; + CmpPredicate OuterPred, InnerPred; Value *LHS, *RHS; // Match lowering of @llvm.vector.reduce.and. Turn diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 28474fec8238..3a074ee70dc4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -600,7 +600,8 @@ public: /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is /// only possible if all operands to the PHI are constants). - Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN); + Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN, + bool AllowMultipleUses = false); /// For a binary operator with 2 phi operands, try to hoist the binary /// operation before the phi. 
This can result in fewer instructions in diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 1fcf1c570add..272a1942c335 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -788,6 +788,9 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast<LoadInst>(V); + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs + // doesn't need to be maintained here. combineMetadata(NewLI, LI, KnownIDs, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index c7a0c35d099c..e7a8e947705f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -58,7 +58,7 @@ static Instruction *foldSelectBinOpIdentity(SelectInst &Sel, // The select condition must be an equality compare with a constant operand. Value *X; Constant *C; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C)))) return nullptr; @@ -425,17 +425,19 @@ Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI, // icmp with a common operand also can have the common operand // pulled after the select. - ICmpInst::Predicate TPred, FPred; + CmpPredicate TPred, FPred; if (match(TI, m_ICmp(TPred, m_Value(), m_Value())) && match(FI, m_ICmp(FPred, m_Value(), m_Value()))) { - if (TPred == FPred || TPred == CmpInst::getSwappedPredicate(FPred)) { - bool Swapped = TPred != FPred; + // FIXME: Use CmpPredicate::getMatching here. 
+ CmpInst::Predicate T = TPred, F = FPred; + if (T == F || T == ICmpInst::getSwappedCmpPredicate(F)) { + bool Swapped = T != F; if (Value *MatchOp = getCommonOp(TI, FI, ICmpInst::isEquality(TPred), Swapped)) { Value *NewSel = Builder.CreateSelect(Cond, OtherOpT, OtherOpF, SI.getName() + ".v", &SI); return new ICmpInst( - MatchIsOpZero ? TPred : CmpInst::getSwappedPredicate(TPred), + MatchIsOpZero ? TPred : ICmpInst::getSwappedCmpPredicate(TPred), MatchOp, NewSel); } } @@ -640,7 +642,7 @@ static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp, static Value *foldSelectICmpAndZeroShl(const ICmpInst *Cmp, Value *TVal, Value *FVal, InstCombiner::BuilderTy &Builder) { - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *AndVal; if (!match(Cmp, m_ICmp(Pred, m_Value(AndVal), m_Zero()))) return nullptr; @@ -867,7 +869,7 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { auto *TrueVal = SI.getTrueValue(); auto *FalseVal = SI.getFalseValue(); Value *X, *Y; - ICmpInst::Predicate Predicate; + CmpPredicate Predicate; // Assuming that constant compared with zero is not undef (but it may be // a vector with some undef elements). 
Otherwise (when a constant is undef) @@ -1527,7 +1529,7 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, return nullptr; Value *Cmp1; - ICmpInst::Predicate Pred1; + CmpPredicate Pred1; Constant *C2; Value *ReplacementLow, *ReplacementHigh; if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow), @@ -1636,7 +1638,7 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, static Instruction * tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, InstCombinerImpl &IC) { - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *X; Constant *C0; if (!match(&Cmp, m_OneUse(m_ICmp( @@ -1734,7 +1736,7 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI, InstCombiner::BuilderTy &Builder) { const APInt *CmpC; Value *V; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(ICI, m_ICmp(Pred, m_Value(V), m_APInt(CmpC)))) return nullptr; @@ -1779,6 +1781,46 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI, return nullptr; } +/// `A == MIN_INT ? B != MIN_INT : A < B` --> `A < B` +/// `A == MAX_INT ? B != MAX_INT : A > B` --> `A > B` +static Instruction *foldSelectWithExtremeEqCond(Value *CmpLHS, Value *CmpRHS, + Value *TrueVal, + Value *FalseVal) { + Type *Ty = CmpLHS->getType(); + + if (Ty->isPtrOrPtrVectorTy()) + return nullptr; + + CmpPredicate Pred; + Value *B; + + if (!match(FalseVal, m_c_ICmp(Pred, m_Specific(CmpLHS), m_Value(B)))) + return nullptr; + + Value *TValRHS; + if (!match(TrueVal, m_SpecificICmp(ICmpInst::ICMP_NE, m_Specific(B), + m_Value(TValRHS)))) + return nullptr; + + APInt C; + unsigned BitWidth = Ty->getScalarSizeInBits(); + + if (ICmpInst::isLT(Pred)) { + C = CmpInst::isSigned(Pred) ? APInt::getSignedMinValue(BitWidth) + : APInt::getMinValue(BitWidth); + } else if (ICmpInst::isGT(Pred)) { + C = CmpInst::isSigned(Pred) ? 
APInt::getSignedMaxValue(BitWidth) + : APInt::getMaxValue(BitWidth); + } else { + return nullptr; + } + + if (!match(CmpRHS, m_SpecificInt(C)) || !match(TValRHS, m_SpecificInt(C))) + return nullptr; + + return new ICmpInst(Pred, CmpLHS, B); +} + static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI, InstCombinerImpl &IC) { ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -1793,6 +1835,10 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI, if (Pred == ICmpInst::ICMP_NE) std::swap(TrueVal, FalseVal); + if (Instruction *Res = + foldSelectWithExtremeEqCond(CmpLHS, CmpRHS, TrueVal, FalseVal)) + return Res; + // Transform (X == C) ? X : Y -> (X == C) ? C : Y // specific handling for Bitwise operation. // x&y -> (x|y) ^ (x^y) or (x|y) & ~(x^y) @@ -1890,7 +1936,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, BinaryOperator *BOp; Constant *C1, *C2, *C3; Value *X; - ICmpInst::Predicate Predicate; + CmpPredicate Predicate; if (!match(Cmp, m_ICmp(Predicate, m_Value(X), m_Constant(C1)))) return nullptr; @@ -2138,7 +2184,7 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) { Type *Ty = Limit->getType(); - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *TrueVal, *FalseVal, *Op; const APInt *C; if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)), @@ -2347,7 +2393,7 @@ static Instruction *foldSelectCmpBitcasts(SelectInst &Sel, Value *TVal = Sel.getTrueValue(); Value *FVal = Sel.getFalseValue(); - CmpInst::Predicate Pred; + CmpPredicate Pred; Value *A, *B; if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B)))) return nullptr; @@ -2552,7 +2598,7 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, Value *X; const APInt *C; bool IsTrueIfSignSet; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(Cond, m_OneUse(m_ICmp(Pred, m_ElementWiseBitCast(m_Value(X)), m_APInt(C)))) || !isSignBitCheck(Pred, 
*C, IsTrueIfSignSet) || X->getType() != SelType) @@ -2748,7 +2794,7 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *Op, *RemRes, *Remainder; const APInt *C; bool TrueIfSigned = false; @@ -2807,7 +2853,7 @@ static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy // a = select c, x, y ; // f(a, c) ; f(poison, 1) cannot happen, but if a is folded // ; to y, this can happen. - CmpInst::Predicate Pred; + CmpPredicate Pred; if (FI->hasOneUse() && match(Cond, m_c_ICmp(Pred, m_Specific(TrueVal), m_Specific(FalseVal))) && (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) { @@ -2856,7 +2902,7 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI, for (bool Swap : {false, true}) { Value *TrueVal = SI.getTrueValue(); Value *X = SI.getFalseValue(); - CmpInst::Predicate Pred; + CmpPredicate Pred; if (Swap) std::swap(TrueVal, X); @@ -2936,7 +2982,7 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI, if (Swap) std::swap(TrueVal, X); - CmpInst::Predicate Pred; + CmpPredicate Pred; const APInt *C; bool TrueIfSigned; if (!match(CondVal, @@ -2980,7 +3026,7 @@ foldRoundUpIntegerWithPow2Alignment(SelectInst &SI, Value *X = SI.getTrueValue(); Value *XBiasedHighBits = SI.getFalseValue(); - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *XLowBits; if (!match(Cond, m_ICmp(Pred, m_Value(XLowBits), m_ZeroInt())) || !ICmpInst::isEquality(Pred)) @@ -3159,7 +3205,7 @@ static bool impliesPoisonOrCond(const Value *ValAssumedPoison, const Value *V, Value *LHS = ICmp->getOperand(0); const APInt *RHSC1; const APInt *RHSC2; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (ICmp->hasSameSign() && match(ICmp->getOperand(1), m_APIntForbidPoison(RHSC1)) && match(V, m_ICmp(Pred, m_Specific(LHS), m_APIntAllowPoison(RHSC2)))) { @@ -3170,7 +3216,7 @@ static bool impliesPoisonOrCond(const 
Value *ValAssumedPoison, const Value *V, APInt::getZero(BitWidth)) : ConstantRange(APInt::getZero(BitWidth), APInt::getSignedMinValue(BitWidth)); - return CRX.icmp(Expected ? Pred : ICmpInst::getInversePredicate(Pred), + return CRX.icmp(Expected ? Pred : ICmpInst::getInverseCmpPredicate(Pred), *RHSC2); } } @@ -3539,7 +3585,7 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder, Value *FalseVal = SI.getFalseValue(); Value *TrueVal = SI.getTrueValue(); - ICmpInst::Predicate Pred; + CmpPredicate Pred; const APInt *Cond1; Value *Cond0, *Ctlz, *CtlzOp; if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(Cond0), m_APInt(Cond1)))) @@ -3590,7 +3636,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { Value *TV = SI.getTrueValue(); Value *FV = SI.getFalseValue(); - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *LHS, *RHS; if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) return nullptr; @@ -3610,7 +3656,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { bool IsSigned = ICmpInst::isSigned(Pred); bool Replace = false; - ICmpInst::Predicate ExtendedCmpPredicate; + CmpPredicate ExtendedCmpPredicate; // (x < y) ? -1 : zext(x != y) // (x < y) ? -1 : zext(x > y) if (ICmpInst::isLT(Pred) && match(TV, m_AllOnes()) && @@ -3630,7 +3676,7 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { Replace = true; // (x == y) ? 0 : (x > y ? 
1 : -1) - ICmpInst::Predicate FalseBranchSelectPredicate; + CmpPredicate FalseBranchSelectPredicate; const APInt *InnerTV, *InnerFV; if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero()) && match(FV, m_Select(m_c_ICmp(FalseBranchSelectPredicate, m_Specific(LHS), @@ -3723,22 +3769,9 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!SIFOp || !SIFOp->hasNoSignedZeros() || !SIFOp->hasNoNaNs()) return nullptr; - // select((fcmp Pred, X, 0), (fadd X, C), C) - // => fadd((select (fcmp Pred, X, 0), X, 0), C) - // - // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE - Instruction *FAdd; - Constant *C; - Value *X, *Z; - CmpInst::Predicate Pred; - - // Note: OneUse check for `Cmp` is necessary because it makes sure that other - // InstCombine folds don't undo this transformation and cause an infinite - // loop. Furthermore, it could also increase the operation count. - if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_OneUse(m_Instruction(FAdd)), m_Constant(C))) || - match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_Constant(C), m_OneUse(m_Instruction(FAdd))))) { + auto TryFoldIntoAddConstant = + [&Builder, &SI](CmpInst::Predicate Pred, Value *X, Value *Z, + Instruction *FAdd, Constant *C, bool Swapped) -> Value * { // Only these relational predicates can be transformed into maxnum/minnum // intrinsic. if (!CmpInst::isRelational(Pred) || !match(Z, m_AnyZeroFP())) @@ -3747,7 +3780,8 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!match(FAdd, m_FAdd(m_Specific(X), m_Specific(C)))) return nullptr; - Value *NewSelect = Builder.CreateSelect(SI.getCondition(), X, Z, "", &SI); + Value *NewSelect = Builder.CreateSelect(SI.getCondition(), Swapped ? Z : X, + Swapped ? 
X : Z, "", &SI); NewSelect->takeName(&SI); Value *NewFAdd = Builder.CreateFAdd(NewSelect, C); @@ -3762,7 +3796,27 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, cast<Instruction>(NewSelect)->setFastMathFlags(NewFMF); return NewFAdd; - } + }; + + // select((fcmp Pred, X, 0), (fadd X, C), C) + // => fadd((select (fcmp Pred, X, 0), X, 0), C) + // + // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE + Instruction *FAdd; + Constant *C; + Value *X, *Z; + CmpPredicate Pred; + + // Note: OneUse check for `Cmp` is necessary because it makes sure that other + // InstCombine folds don't undo this transformation and cause an infinite + // loop. Furthermore, it could also increase the operation count. + if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_OneUse(m_Instruction(FAdd)), m_Constant(C)))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/false); + + if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_Constant(C), m_OneUse(m_Instruction(FAdd))))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/true); return nullptr; } @@ -3798,6 +3852,12 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { ConstantInt::getFalse(CondType), SQ, /* AllowRefinement */ true)) return replaceOperand(SI, 2, S); + + if (replaceInInstruction(TrueVal, CondVal, + ConstantInt::getTrue(CondType)) || + replaceInInstruction(FalseVal, CondVal, + ConstantInt::getFalse(CondType))) + return &SI; } if (Instruction *R = foldSelectOfBools(SI)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 10c3ccdb2243..d511e79e3e48 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -427,7 +427,8 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - if (Constant *CUI = 
dyn_cast<Constant>(Op1)) + Constant *CUI; + if (match(Op1, m_ImmConstant(CUI))) if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 09eafd09451b..ce6154fd610e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -86,7 +86,7 @@ static bool cheapToScalarize(Value *V, Value *EI) { if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI)) return true; - CmpInst::Predicate UnusedPred; + CmpPredicate UnusedPred; if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1))))) if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI)) return true; @@ -486,7 +486,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } Value *X, *Y; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) && cheapToScalarize(SrcVec, Index)) { // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index) @@ -2978,7 +2978,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { } } if (auto *PN = dyn_cast<PHINode>(LHS)) { - if (Instruction *I = foldOpIntoPhi(SVI, PN)) + if (Instruction *I = foldOpIntoPhi(SVI, PN, /*AllowMultipleUses=*/true)) return I; } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 3325a1868ebd..934156f04f7f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1763,7 +1763,8 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN, return nullptr; } -Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { +Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, + bool AllowMultipleUses) { unsigned 
NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return nullptr; @@ -1771,7 +1772,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { // We normally only transform phis with a single use. However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the // uses into the PHI. - if (!PN->hasOneUse()) { + bool OneUse = PN->hasOneUse(); + bool IdenticalUsers = false; + if (!AllowMultipleUses && !OneUse) { // Walk the use list for the instruction, comparing them to I. for (User *U : PN->users()) { Instruction *UI = cast<Instruction>(U); @@ -1779,6 +1782,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { return nullptr; } // Otherwise, we can replace *all* users with the new PHI we form. + IdenticalUsers = true; } // Check that all operands are phi-translatable. @@ -1829,6 +1833,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { continue; } + if (!OneUse && !IdenticalUsers) + return nullptr; + if (SeenNonSimplifiedInVal) return nullptr; // More than one non-simplified value. 
SeenNonSimplifiedInVal = true; @@ -1890,17 +1897,22 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { for (unsigned i = 0; i != NumPHIValues; ++i) NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i)); - for (User *U : make_early_inc_range(PN->users())) { - Instruction *User = cast<Instruction>(U); - if (User == &I) - continue; - replaceInstUsesWith(*User, NewPN); - eraseInstFromFunction(*User); + if (IdenticalUsers) { + for (User *U : make_early_inc_range(PN->users())) { + Instruction *User = cast<Instruction>(U); + if (User == &I) + continue; + replaceInstUsesWith(*User, NewPN); + eraseInstFromFunction(*User); + } + OneUse = true; } - replaceAllDbgUsesWith(const_cast<PHINode &>(*PN), - const_cast<PHINode &>(*NewPN), - const_cast<PHINode &>(*PN), DT); + if (OneUse) { + replaceAllDbgUsesWith(const_cast<PHINode &>(*PN), + const_cast<PHINode &>(*NewPN), + const_cast<PHINode &>(*PN), DT); + } return replaceInstUsesWith(I, NewPN); } @@ -2756,6 +2768,111 @@ static bool shouldCanonicalizeGEPToPtrAdd(GetElementPtrInst &GEP) { }); } +static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, + IRBuilderBase &Builder) { + auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0)); + if (!Op1) + return nullptr; + + // Don't fold a GEP into itself through a PHI node. This can only happen + // through the back-edge of a loop. Folding a GEP into itself means that + // the value of the previous iteration needs to be stored in the meantime, + // thus requiring an additional register variable to be live, but not + // actually achieving anything (the GEP still needs to be executed once per + // loop iteration). 
+ if (Op1 == &GEP) + return nullptr; + + int DI = -1; + + for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { + auto *Op2 = dyn_cast<GetElementPtrInst>(*I); + if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands() || + Op1->getSourceElementType() != Op2->getSourceElementType()) + return nullptr; + + // As for Op1 above, don't try to fold a GEP into itself. + if (Op2 == &GEP) + return nullptr; + + // Keep track of the type as we walk the GEP. + Type *CurTy = nullptr; + + for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) { + if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType()) + return nullptr; + + if (Op1->getOperand(J) != Op2->getOperand(J)) { + if (DI == -1) { + // We have not seen any differences yet in the GEPs feeding the + // PHI yet, so we record this one if it is allowed to be a + // variable. + + // The first two arguments can vary for any GEP, the rest have to be + // static for struct slots + if (J > 1) { + assert(CurTy && "No current type?"); + if (CurTy->isStructTy()) + return nullptr; + } + + DI = J; + } else { + // The GEP is different by more than one input. While this could be + // extended to support GEPs that vary by more than one variable it + // doesn't make sense since it greatly increases the complexity and + // would result in an R+R+R addressing mode which no backend + // directly supports and would need to be broken into several + // simpler instructions anyway. + return nullptr; + } + } + + // Sink down a layer of the type for the next iteration. + if (J > 0) { + if (J == 1) { + CurTy = Op1->getSourceElementType(); + } else { + CurTy = + GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J)); + } + } + } + } + + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. 
+ if (DI != -1 && !PN->hasOneUse()) + return nullptr; + + auto *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + if (DI == -1) { + // All the GEPs feeding the PHI are identical. Clone one down into our + // BB so that it can be merged with the current GEP. + } else { + // All the GEPs feeding the PHI differ at a single offset. Clone a GEP + // into the current block so it can be merged, and create a new PHI to + // set that index. + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(PN); + NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } + + for (auto &I : PN->operands()) + NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), + PN->getIncomingBlock(I)); + + NewGEP->setOperand(DI, NewPN); + } + + NewGEP->insertBefore(*GEP.getParent(), GEP.getParent()->getFirstInsertionPt()); + return NewGEP; +} + Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *PtrOp = GEP.getOperand(0); SmallVector<Value *, 8> Indices(GEP.indices()); @@ -2846,107 +2963,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Check to see if the inputs to the PHI node are getelementptr instructions. if (auto *PN = dyn_cast<PHINode>(PtrOp)) { - auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0)); - if (!Op1) - return nullptr; - - // Don't fold a GEP into itself through a PHI node. This can only happen - // through the back-edge of a loop. Folding a GEP into itself means that - // the value of the previous iteration needs to be stored in the meantime, - // thus requiring an additional register variable to be live, but not - // actually achieving anything (the GEP still needs to be executed once per - // loop iteration). 
- if (Op1 == &GEP) - return nullptr; - - int DI = -1; - - for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { - auto *Op2 = dyn_cast<GetElementPtrInst>(*I); - if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands() || - Op1->getSourceElementType() != Op2->getSourceElementType()) - return nullptr; - - // As for Op1 above, don't try to fold a GEP into itself. - if (Op2 == &GEP) - return nullptr; - - // Keep track of the type as we walk the GEP. - Type *CurTy = nullptr; - - for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) { - if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType()) - return nullptr; - - if (Op1->getOperand(J) != Op2->getOperand(J)) { - if (DI == -1) { - // We have not seen any differences yet in the GEPs feeding the - // PHI yet, so we record this one if it is allowed to be a - // variable. - - // The first two arguments can vary for any GEP, the rest have to be - // static for struct slots - if (J > 1) { - assert(CurTy && "No current type?"); - if (CurTy->isStructTy()) - return nullptr; - } - - DI = J; - } else { - // The GEP is different by more than one input. While this could be - // extended to support GEPs that vary by more than one variable it - // doesn't make sense since it greatly increases the complexity and - // would result in an R+R+R addressing mode which no backend - // directly supports and would need to be broken into several - // simpler instructions anyway. - return nullptr; - } - } - - // Sink down a layer of the type for the next iteration. - if (J > 0) { - if (J == 1) { - CurTy = Op1->getSourceElementType(); - } else { - CurTy = - GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J)); - } - } - } - } - - // If not all GEPs are identical we'll have to create a new PHI node. - // Check that the old PHI node has only one use so that it will get - // removed. 
- if (DI != -1 && !PN->hasOneUse()) - return nullptr; - - auto *NewGEP = cast<GetElementPtrInst>(Op1->clone()); - if (DI == -1) { - // All the GEPs feeding the PHI are identical. Clone one down into our - // BB so that it can be merged with the current GEP. - } else { - // All the GEPs feeding the PHI differ at a single offset. Clone a GEP - // into the current block so it can be merged, and create a new PHI to - // set that index. - PHINode *NewPN; - { - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(PN); - NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(), - PN->getNumOperands()); - } - - for (auto &I : PN->operands()) - NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), - PN->getIncomingBlock(I)); - - NewGEP->setOperand(DI, NewPN); - } - - NewGEP->insertBefore(*GEP.getParent(), GEP.getParent()->getFirstInsertionPt()); - return replaceOperand(GEP, 0, NewGEP); + if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder)) + return replaceOperand(GEP, 0, NewPtrOp); } if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) @@ -3113,6 +3131,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } + // nusw + nneg -> nuw + if (GEP.hasNoUnsignedSignedWrap() && !GEP.hasNoUnsignedWrap() && + all_of(GEP.indices(), [&](Value *Idx) { + return isKnownNonNegative(Idx, SQ.getWithInstruction(&GEP)); + })) { + GEP.setNoWrapFlags(GEP.getNoWrapFlags() | GEPNoWrapFlags::noUnsignedWrap()); + return &GEP; + } + if (Instruction *R = foldSelectGEP(GEP, Builder)) return R; @@ -3443,7 +3470,7 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI, // Validate the rest of constraint #1 by matching on the pred branch. 
Instruction *TI = PredBB->getTerminator(); BasicBlock *TrueBB, *FalseBB; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(TI, m_Br(m_ICmp(Pred, m_CombineOr(m_Specific(Op), m_Specific(Op->stripPointerCasts())), @@ -3724,7 +3751,7 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { return replaceOperand(BI, 0, ConstantInt::getFalse(Cond->getType())); // Canonicalize, for example, fcmp_one -> fcmp_oeq. - CmpInst::Predicate Pred; + CmpPredicate Pred; if (match(Cond, m_OneUse(m_FCmp(Pred, m_Value(), m_Value()))) && !isCanonicalPredicate(Pred)) { // Swap destinations and condition. @@ -3785,7 +3812,7 @@ static Value *simplifySwitchOnSelectUsingRanges(SwitchInst &SI, if (CstBB != SI.getDefaultDest()) return nullptr; Value *X = Select->getOperand(3 - CstOpIdx); - ICmpInst::Predicate Pred; + CmpPredicate Pred; const APInt *RHSC; if (!match(Select->getCondition(), m_ICmp(Pred, m_Specific(X), m_APInt(RHSC)))) diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index b398a13383b9..41e503858124 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -36,15 +37,16 @@ using namespace llvm; static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", cl::desc("Use one trap block per function")); -static cl::opt<bool> DebugTrapBB("bounds-checking-unique-traps", - cl::desc("Always use one trap per check")); - STATISTIC(ChecksAdded, "Bounds checks added"); STATISTIC(ChecksSkipped, "Bounds checks skipped"); STATISTIC(ChecksUnable, "Bounds checks unable to add"); using BuilderTy = IRBuilder<TargetFolder>; 
+BoundsCheckingPass::BoundsCheckingOptions::BoundsCheckingOptions( + ReportingMode Mode, bool Merge) + : Mode(Mode), Merge(Merge) {} + /// Gets the conditions under which memory accessing instructions will overflow. /// /// \p Ptr is the pointer that will be read/written, and \p InstVal is either @@ -104,6 +106,30 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal, return Or; } +static CallInst *InsertTrap(BuilderTy &IRB, bool DebugTrapBB) { + if (!DebugTrapBB) + return IRB.CreateIntrinsic(Intrinsic::trap, {}, {}); + // FIXME: Ideally we would use the SanitizerHandler::OutOfBounds constant. + return IRB.CreateIntrinsic( + Intrinsic::ubsantrap, {}, + ConstantInt::get(IRB.getInt8Ty(), + IRB.GetInsertBlock()->getParent()->size())); +} + +static CallInst *InsertCall(BuilderTy &IRB, bool MayReturn, StringRef Name) { + Function *Fn = IRB.GetInsertBlock()->getParent(); + LLVMContext &Ctx = Fn->getContext(); + llvm::AttrBuilder B(Ctx); + B.addAttribute(llvm::Attribute::NoUnwind); + if (!MayReturn) + B.addAttribute(llvm::Attribute::NoReturn); + FunctionCallee Callee = Fn->getParent()->getOrInsertFunction( + Name, + llvm::AttributeList::get(Ctx, llvm::AttributeList::FunctionIndex, B), + Type::getVoidTy(Ctx)); + return IRB.CreateCall(Callee); +} + /// Adds run-time bounds checks to memory accessing instructions. /// /// \p Or is the condition that should guard the trap. @@ -126,20 +152,56 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) { BasicBlock *Cont = OldBB->splitBasicBlock(SplitI); OldBB->getTerminator()->eraseFromParent(); + BasicBlock *TrapBB = GetTrapBB(IRB, Cont); + if (C) { // If we have a constant zero, unconditionally branch. // FIXME: We should really handle this differently to bypass the splitting // the block. - BranchInst::Create(GetTrapBB(IRB), OldBB); + BranchInst::Create(TrapBB, OldBB); return; } // Create the conditional branch. 
- BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB); + BranchInst::Create(TrapBB, Cont, Or, OldBB); } +struct ReportingOpts { + bool MayReturn = false; + bool UseTrap = false; + bool MinRuntime = false; + bool MayMerge = true; + StringRef Name; + + ReportingOpts(BoundsCheckingPass::ReportingMode Mode, bool Merge) { + switch (Mode) { + case BoundsCheckingPass::ReportingMode::Trap: + UseTrap = true; + break; + case BoundsCheckingPass::ReportingMode::MinRuntime: + Name = "__ubsan_handle_local_out_of_bounds_minimal"; + MinRuntime = true; + MayReturn = true; + break; + case BoundsCheckingPass::ReportingMode::MinRuntimeAbort: + Name = "__ubsan_handle_local_out_of_bounds_minimal_abort"; + MinRuntime = true; + break; + case BoundsCheckingPass::ReportingMode::FullRuntime: + Name = "__ubsan_handle_local_out_of_bounds"; + MayReturn = true; + break; + case BoundsCheckingPass::ReportingMode::FullRuntimeAbort: + Name = "__ubsan_handle_local_out_of_bounds_abort"; + break; + } + + MayMerge = Merge; + } +}; + static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, - ScalarEvolution &SE) { + ScalarEvolution &SE, const ReportingOpts &Opts) { if (F.hasFnAttribute(Attribute::NoSanitizeBounds)) return false; @@ -180,37 +242,43 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, // Create a trapping basic block on demand using a callback. Depending on // flags, this will either create a single block for the entire function or // will create a fresh block every time it is called. - BasicBlock *TrapBB = nullptr; - auto GetTrapBB = [&TrapBB](BuilderTy &IRB) { + BasicBlock *ReuseTrapBB = nullptr; + auto GetTrapBB = [&ReuseTrapBB, &Opts](BuilderTy &IRB, BasicBlock *Cont) { Function *Fn = IRB.GetInsertBlock()->getParent(); auto DebugLoc = IRB.getCurrentDebugLocation(); IRBuilder<>::InsertPointGuard Guard(IRB); - if (TrapBB && SingleTrapBB && !DebugTrapBB) - return TrapBB; + // Create a trapping basic block on demand using a callback. 
Depending on + // flags, this will either create a single block for the entire function or + // will create a fresh block every time it is called. + if (ReuseTrapBB) + return ReuseTrapBB; - TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + BasicBlock *TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); IRB.SetInsertPoint(TrapBB); - Intrinsic::ID IntrID = DebugTrapBB ? Intrinsic::ubsantrap : Intrinsic::trap; + bool DebugTrapBB = !Opts.MayMerge; + CallInst *TrapCall = Opts.UseTrap + ? InsertTrap(IRB, DebugTrapBB) + : InsertCall(IRB, Opts.MayReturn, Opts.Name); + if (DebugTrapBB) + TrapCall->addFnAttr(llvm::Attribute::NoMerge); - CallInst *TrapCall; - if (DebugTrapBB) { - TrapCall = IRB.CreateIntrinsic( - IntrID, {}, ConstantInt::get(IRB.getInt8Ty(), Fn->size())); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(DebugLoc); + if (Opts.MayReturn) { + IRB.CreateBr(Cont); } else { - TrapCall = IRB.CreateIntrinsic(IntrID, {}, {}); + TrapCall->setDoesNotReturn(); + IRB.CreateUnreachable(); } - TrapCall->setDoesNotReturn(); - TrapCall->setDoesNotThrow(); - TrapCall->setDebugLoc(DebugLoc); - IRB.CreateUnreachable(); + if (!Opts.MayReturn && SingleTrapBB && !DebugTrapBB) + ReuseTrapBB = TrapBB; return TrapBB; }; - // Add the checks. 
for (const auto &Entry : TrapInfo) { Instruction *Inst = Entry.first; BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL)); @@ -224,8 +292,35 @@ PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager & auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); - if (!addBoundsChecking(F, TLI, SE)) + if (!addBoundsChecking(F, TLI, SE, + ReportingOpts(Options.Mode, Options.Merge))) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } + +void BoundsCheckingPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<BoundsCheckingPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + switch (Options.Mode) { + case ReportingMode::Trap: + OS << "<trap"; + break; + case ReportingMode::MinRuntime: + OS << "<min-rt"; + break; + case ReportingMode::MinRuntimeAbort: + OS << "<min-rt-abort"; + break; + case ReportingMode::FullRuntime: + OS << "<rt"; + break; + case ReportingMode::FullRuntimeAbort: + OS << "<rt-abort"; + break; + } + if (Options.Merge) + OS << ";merge"; + OS << ">"; +} diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 3e3c3eced4bb..5c437437fe36 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -19,11 +19,11 @@ add_llvm_component_library(LLVMInstrumentation PGOForceFunctionAttrs.cpp PGOInstrumentation.cpp PGOMemOPSizeOpt.cpp - PoisonChecking.cpp SanitizerCoverage.cpp SanitizerBinaryMetadata.cpp ValueProfileCollector.cpp ThreadSanitizer.cpp + TypeSanitizer.cpp HWAddressSanitizer.cpp RealtimeSanitizer.cpp diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index f9be7f933d31..6e86ffdc8027 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ 
b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -61,7 +61,7 @@ enum : uint32_t { }; static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version", - cl::init("408*"), cl::Hidden, + cl::init("0000"), cl::Hidden, cl::ValueRequired); static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden, @@ -154,6 +154,7 @@ private: GCOVOptions Options; llvm::endianness Endian; raw_ostream *os; + int Version = 0; // Checksum, produced by hash of EdgeDestinations SmallVector<uint32_t, 4> FileChecksums; @@ -334,12 +335,9 @@ namespace { : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident), Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) { LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); - bool ExitBlockBeforeBody = Version >= 48; - uint32_t i = ExitBlockBeforeBody ? 2 : 1; + uint32_t i = 2; for (BasicBlock &BB : *F) Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++))); - if (!ExitBlockBeforeBody) - ReturnBlock.Number = i; std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); @@ -363,44 +361,28 @@ namespace { void writeOut(uint32_t CfgChecksum) { write(GCOV_TAG_FUNCTION); SmallString<128> Filename = getFilename(SP); - uint32_t BlockLen = - 2 + (Version >= 47) + wordsOfString(getFunctionName(SP)); - if (Version < 80) - BlockLen += wordsOfString(Filename) + 1; - else - BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90); + uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP)); + BlockLen += 1 + wordsOfString(Filename) + 4; write(BlockLen); write(Ident); write(FuncChecksum); - if (Version >= 47) - write(CfgChecksum); + write(CfgChecksum); writeString(getFunctionName(SP)); - if (Version < 80) { - writeString(Filename); - write(SP->getLine()); - } else { - write(SP->isArtificial()); // artificial - writeString(Filename); - write(SP->getLine()); // start_line - write(0); // start_column - // EndLine is the last line with !dbg. It is not the } line as in GCC, - // but good enough. 
- write(EndLine); - if (Version >= 90) - write(0); // end_column - } + + write(SP->isArtificial()); // artificial + writeString(Filename); + write(SP->getLine()); // start_line + write(0); // start_column + // EndLine is the last line with !dbg. It is not the } line as in GCC, + // but good enough. + write(EndLine); + write(0); // end_column // Emit count of blocks. write(GCOV_TAG_BLOCKS); - if (Version < 80) { - write(Blocks.size() + 2); - for (int i = Blocks.size() + 2; i; --i) - write(0); - } else { - write(1); - write(Blocks.size() + 2); - } + write(1); + write(Blocks.size() + 2); LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n"); // Emit edges between blocks. @@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes( function_ref<BlockFrequencyInfo *(Function &F)> GetBFI, function_ref<BranchProbabilityInfo *(Function &F)> GetBPI, function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) { - int Version; { uint8_t c3 = Options.Version[0]; uint8_t c2 = Options.Version[1]; @@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes( Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0' : (c3 - '0') * 10 + c1 - '0'; } + // Emit .gcno files that are compatible with GCC 11.1. 
+ if (Version < 111) { + Version = 111; + memcpy(Options.Version, "B11*", 4); + } bool EmitGCDA = Options.EmitData; for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) { @@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes( out.write(Tmp, 4); } write(Stamp); - if (Version >= 90) - writeString(""); // unuseful current_working_directory - if (Version >= 80) - write(0); // unuseful has_unexecuted_blocks + writeString("."); // unuseful current_working_directory + write(0); // unuseful has_unexecuted_blocks for (auto &Func : Funcs) Func->writeOut(Stamp); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 33a7a37fa28e..f1580b025efc 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -152,7 +152,7 @@ static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"), // override these hints anyway. static cl::opt<bool> ClMemProfMatchHotColdNew( "memprof-match-hot-cold-new", - cl::desc( + cl::desc( "Match allocation profiles onto existing hot/cold operator new calls"), cl::Hidden, cl::init(false)); @@ -166,8 +166,26 @@ static cl::opt<bool> "context in this module's profiles"), cl::Hidden, cl::init(false)); +static cl::opt<std::string> + MemprofRuntimeDefaultOptions("memprof-runtime-default-options", + cl::desc("The default memprof options"), + cl::Hidden, cl::init("")); + +static cl::opt<bool> + SalvageStaleProfile("memprof-salvage-stale-profile", + cl::desc("Salvage stale MemProf profile"), + cl::init(false), cl::Hidden); + +cl::opt<unsigned> MinClonedColdBytePercent( + "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); + extern cl::opt<bool> MemProfReportHintedSizes; +static cl::opt<unsigned> MinMatchedColdBytePercent( + "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold 
bytes matched to hint allocation cold")); + // Instrumentation statistics STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); @@ -547,6 +565,20 @@ void createMemprofHistogramFlagVar(Module &M) { appendToCompilerUsed(M, MemprofHistogramFlag); } +void createMemprofDefaultOptionsVar(Module &M) { + Constant *OptionsConst = ConstantDataArray::getString( + M.getContext(), MemprofRuntimeDefaultOptions, /*AddNull=*/true); + GlobalVariable *OptionsVar = + new GlobalVariable(M, OptionsConst->getType(), /*isConstant=*/true, + GlobalValue::WeakAnyLinkage, OptionsConst, + "__memprof_default_options_str"); + Triple TT(M.getTargetTriple()); + if (TT.supportsCOMDAT()) { + OptionsVar->setLinkage(GlobalValue::ExternalLinkage); + OptionsVar->setComdat(M.getOrInsertComdat(OptionsVar->getName())); + } +} + bool ModuleMemProfiler::instrumentModule(Module &M) { // Create a module constructor. @@ -566,6 +598,8 @@ bool ModuleMemProfiler::instrumentModule(Module &M) { createMemprofHistogramFlagVar(M); + createMemprofDefaultOptionsVar(M); + return true; } @@ -704,8 +738,7 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { // Helper to generate a single hash id for a given callstack, used for emitting // matching statistics and useful for uniquing such statistics across modules. 
-static uint64_t -computeFullStackId(const std::vector<memprof::Frame> &CallStack) { +static uint64_t computeFullStackId(ArrayRef<Frame> CallStack) { llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little> HashBuilder; for (auto &F : CallStack) @@ -726,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, AllocInfo->Info.getAllocCount(), AllocInfo->Info.getTotalLifetime()); std::vector<ContextTotalSize> ContextSizeInfo; - if (MemProfReportHintedSizes) { + if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { auto TotalSize = AllocInfo->Info.getTotalSize(); assert(TotalSize); assert(FullStackId != 0); @@ -742,9 +775,8 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, // non-zero. static bool stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack, - ArrayRef<uint64_t> InlinedCallStack, - unsigned StartIndex = 0) { - auto StackFrame = ProfileCallStack.begin() + StartIndex; + ArrayRef<uint64_t> InlinedCallStack) { + auto StackFrame = ProfileCallStack.begin(); auto InlCallStackIter = InlinedCallStack.begin(); for (; StackFrame != ProfileCallStack.end() && InlCallStackIter != InlinedCallStack.end(); @@ -800,7 +832,8 @@ struct AllocMatchInfo { }; DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> -memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) { +memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI, + function_ref<bool(uint64_t)> IsPresentInProfile) { DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls; auto GetOffset = [](const DILocation *DIL) { @@ -824,7 +857,12 @@ memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) { continue; StringRef CalleeName = CalledFunction->getName(); + // True if we are calling a heap allocation function that supports + // hot/cold variants. bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI); + // True for the first iteration below, indicating that we are looking at + // a leaf node. 
+ bool IsLeaf = true; for (const DILocation *DIL = I.getDebugLoc(); DIL; DIL = DIL->getInlinedAt()) { StringRef CallerName = DIL->getSubprogramLinkageName(); @@ -833,16 +871,27 @@ memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) { uint64_t CallerGUID = IndexedMemProfRecord::getGUID(CallerName); uint64_t CalleeGUID = IndexedMemProfRecord::getGUID(CalleeName); // Pretend that we are calling a function with GUID == 0 if we are - // calling a heap allocation function. - if (IsAlloc) - CalleeGUID = 0; + // in the inline stack leading to a heap allocation function. + if (IsAlloc) { + if (IsLeaf) { + // For leaf nodes, set CalleeGUID to 0 without consulting + // IsPresentInProfile. + CalleeGUID = 0; + } else if (!IsPresentInProfile(CalleeGUID)) { + // In addition to the leaf case above, continue to set CalleeGUID + // to 0 as long as we don't see CalleeGUID in the profile. + CalleeGUID = 0; + } else { + // Once we encounter a callee that exists in the profile, stop + // setting CalleeGUID to 0. + IsAlloc = false; + } + } + LineLocation Loc = {GetOffset(DIL), DIL->getColumn()}; Calls[CallerGUID].emplace_back(Loc, CalleeGUID); CalleeName = CallerName; - // FIXME: Recognize other frames that are associated with heap - // allocation functions. It may be too early to reset IsAlloc to - // false here. - IsAlloc = false; + IsLeaf = false; } } } @@ -865,7 +914,9 @@ memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader, DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile = MemProfReader->getMemProfCallerCalleePairs(); DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR = - extractCallsFromIR(M, TLI); + extractCallsFromIR(M, TLI, [&](uint64_t GUID) { + return CallsFromProfile.contains(GUID); + }); // Compute an undrift map for each CallerGUID. 
for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) { @@ -888,10 +939,38 @@ memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader, return UndriftMaps; } +// Given a MemProfRecord, undrift all the source locations present in the +// record in place. +static void +undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps, + memprof::MemProfRecord &MemProfRec) { + // Undrift a call stack in place. + auto UndriftCallStack = [&](std::vector<Frame> &CallStack) { + for (auto &F : CallStack) { + auto I = UndriftMaps.find(F.Function); + if (I == UndriftMaps.end()) + continue; + auto J = I->second.find(LineLocation(F.LineOffset, F.Column)); + if (J == I->second.end()) + continue; + auto &NewLoc = J->second; + F.LineOffset = NewLoc.LineOffset; + F.Column = NewLoc.Column; + } + }; + + for (auto &AS : MemProfRec.AllocSites) + UndriftCallStack(AS.CallStack); + + for (auto &CS : MemProfRec.CallSites) + UndriftCallStack(CS); +} + static void readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, const TargetLibraryInfo &TLI, - std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo, + DenseMap<uint64_t, LocToLocMap> &UndriftMaps) { auto &Ctx = M.getContext(); // Previously we used getIRPGOFuncName() here. If F is local linkage, // getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But @@ -939,6 +1018,11 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, NumOfMemProfFunc++; + // If requested, undrift MemProfRecord so that the source locations in it + // match those in the IR. + if (SalvageStaleProfile) + undriftMemProfRecord(UndriftMaps, *MemProfRec); + // Detect if there are non-zero column numbers in the profile. If not, // treat all column numbers as 0 when matching (i.e. ignore any non-zero // columns in the IR). 
The profiled binary might have been built with @@ -948,9 +1032,15 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, // Build maps of the location hash to all profile data with that leaf location // (allocation info and the callsites). std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo; - // For the callsites we need to record the index of the associated frame in - // the frame array (see comments below where the map entries are added). - std::map<uint64_t, std::set<std::pair<const std::vector<Frame> *, unsigned>>> + // A hash function for std::unordered_set<ArrayRef<Frame>> to work. + struct CallStackHash { + size_t operator()(ArrayRef<Frame> CS) const { + return computeFullStackId(CS); + } + }; + // For the callsites we need to record slices of the frame array (see comments + // below where the map entries are added). + std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>> LocHashToCallSites; for (auto &AI : MemProfRec->AllocSites) { NumOfMemProfAllocContextProfiles++; @@ -968,7 +1058,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, unsigned Idx = 0; for (auto &StackFrame : CS) { uint64_t StackId = computeStackId(StackFrame); - LocHashToCallSites[StackId].insert(std::make_pair(&CS, Idx++)); + LocHashToCallSites[StackId].insert(ArrayRef<Frame>(CS).drop_front(Idx++)); ProfileHasColumns |= StackFrame.Column; // Once we find this function, we can stop recording. if (StackFrame.Function == FuncGUID) @@ -1008,8 +1098,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, // and another callsite). 
std::map<uint64_t, std::set<const AllocationInfo *>>::iterator AllocInfoIter; - std::map<uint64_t, std::set<std::pair<const std::vector<Frame> *, - unsigned>>>::iterator CallSitesIter; + decltype(LocHashToCallSites)::iterator CallSitesIter; for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr; DIL = DIL->getInlinedAt()) { // Use C++ linkage name if possible. Need to compile with @@ -1050,6 +1139,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie; + uint64_t TotalSize = 0; + uint64_t TotalColdSize = 0; for (auto *AllocInfo : AllocInfoIter->second) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include @@ -1058,9 +1149,13 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes) + if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes || + MinClonedColdBytePercent < 100) FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); + TotalSize += AllocInfo->Info.getTotalSize(); + if (AllocType == AllocationType::Cold) + TotalColdSize += AllocInfo->Info.getTotalSize(); // Record information about the allocation if match info printing // was requested. if (ClPrintMemProfMatchInfo) { @@ -1070,6 +1165,16 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, } } } + // If the threshold for the percent of cold bytes is less than 100%, + // and not all bytes are cold, see if we should still hint this + // allocation as cold without context sensitivity. 
+ if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 && + TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) { + AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold, + "dominant"); + continue; + } + // We might not have matched any to the full inlined call stack. // But if we did, create and attach metadata, or a function attribute if // all contexts have identical profiled behavior. @@ -1100,8 +1205,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, for (auto CallStackIdx : CallSitesIter->second) { // If we found and thus matched all frames on the call, create and // attach call stack metadata. - if (stackFrameIncludesInlinedCallStack( - *CallStackIdx.first, InlinedCallStack, CallStackIdx.second)) { + if (stackFrameIncludesInlinedCallStack(CallStackIdx, + InlinedCallStack)) { NumOfMemProfMatchedCallSites++; addCallsiteMetadata(I, InlinedCallStack, Ctx); // Only need to find one with a matching call stack and add a single @@ -1152,6 +1257,11 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin()); + DenseMap<uint64_t, LocToLocMap> UndriftMaps; + if (SalvageStaleProfile) + UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI); + // Map from the stack hash of each allocation context in the function profiles // to the total profiled size (bytes), allocation type, and whether we matched // it to an allocation in the IR. 
@@ -1162,7 +1272,8 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { continue; const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); - readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo); + readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo, + UndriftMaps); } if (ClPrintMemProfMatchInfo) { diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 5c419c6374bd..429e323b6b7c 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4026,6 +4026,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + // Approximation only + void handleNEONVectorMultiplyIntrinsic(IntrinsicInst &I) { + handleShadowOr(I); + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::uadd_with_overflow: @@ -4341,6 +4346,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handlePclmulIntrinsic(I); break; + case Intrinsic::x86_avx_round_pd_256: + case Intrinsic::x86_avx_round_ps_256: case Intrinsic::x86_sse41_round_pd: case Intrinsic::x86_sse41_round_ps: handleRoundPdPsIntrinsic(I); @@ -4429,6 +4436,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + case Intrinsic::aarch64_neon_fmulx: + case Intrinsic::aarch64_neon_pmul: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_pmull64: + case Intrinsic::aarch64_neon_umull: { + handleNEONVectorMultiplyIntrinsic(I); + break; + } + default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp deleted file mode 100644 index e094acdc3178..000000000000 --- 
a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ /dev/null @@ -1,358 +0,0 @@ -//===- PoisonChecking.cpp - -----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implements a transform pass which instruments IR such that poison semantics -// are made explicit. That is, it provides a (possibly partial) executable -// semantics for every instruction w.r.t. poison as specified in the LLVM -// LangRef. There are obvious parallels to the sanitizer tools, but this pass -// is focused purely on the semantics of LLVM IR, not any particular source -// language. If you're looking for something to see if your C/C++ contains -// UB, this is not it. -// -// The rewritten semantics of each instruction will include the following -// components: -// -// 1) The original instruction, unmodified. -// 2) A propagation rule which translates dynamic information about the poison -// state of each input to whether the dynamic output of the instruction -// produces poison. -// 3) A creation rule which validates any poison producing flags on the -// instruction itself (e.g. checks for overflow on nsw). -// 4) A check rule which traps (to a handler function) if this instruction must -// execute undefined behavior given the poison state of its inputs. -// -// This is a must analysis based transform; that is, the resulting code may -// produce a false negative result (not report UB when it actually exists -// according to the LangRef spec), but should never produce a false positive -// (report UB where it doesn't exist). -// -// Use cases for this pass include: -// - Understanding (and testing!) the implications of the definition of poison -// from the LangRef. 
-// - Validating the output of an IR fuzzer to ensure that all programs produced -// are well defined on the specific input used. -// - Finding/confirming poison specific miscompiles by checking the poison -// status of an input/IR pair is the same before and after an optimization -// transform. -// - Checking that a bugpoint reduction does not introduce UB which didn't -// exist in the original program being reduced. -// -// The major sources of inaccuracy are currently: -// - Most validation rules not yet implemented for instructions with poison -// relevant flags. At the moment, only nsw/nuw on add/sub are supported. -// - UB which is control dependent on a branch on poison is not yet -// reported. Currently, only data flow dependence is modeled. -// - Poison which is propagated through memory is not modeled. As such, -// storing poison to memory and then reloading it will cause a false negative -// as we consider the reloaded value to not be poisoned. -// - Poison propagation across function boundaries is not modeled. At the -// moment, all arguments and return values are assumed not to be poison. -// - Undef is not modeled. In particular, the optimizer's freedom to pick -// concrete values for undef bits so as to maximize potential for producing -// poison is not modeled. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Instrumentation/PoisonChecking.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -#define DEBUG_TYPE "poison-checking" - -static cl::opt<bool> -LocalCheck("poison-checking-function-local", - cl::init(false), - cl::desc("Check that returns are non-poison (for testing)")); - - -static bool isConstantFalse(Value* V) { - assert(V->getType()->isIntegerTy(1)); - if (auto *CI = dyn_cast<ConstantInt>(V)) - return CI->isZero(); - return false; -} - -static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) { - if (Ops.size() == 0) - return B.getFalse(); - unsigned i = 0; - for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {} - if (i == Ops.size()) - return B.getFalse(); - Value *Accum = Ops[i++]; - for (Value *Op : llvm::drop_begin(Ops, i)) - if (!isConstantFalse(Op)) - Accum = B.CreateOr(Accum, Op); - return Accum; -} - -static void generateCreationChecksForBinOp(Instruction &I, - SmallVectorImpl<Value*> &Checks) { - assert(isa<BinaryOperator>(I)); - - IRBuilder<> B(&I); - Value *LHS = I.getOperand(0); - Value *RHS = I.getOperand(1); - switch (I.getOpcode()) { - default: - return; - case Instruction::Add: { - if (I.hasNoSignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - if (I.hasNoUnsignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - break; - } - case Instruction::Sub: { - if (I.hasNoSignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - if 
(I.hasNoUnsignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - break; - } - case Instruction::Mul: { - if (I.hasNoSignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - if (I.hasNoUnsignedWrap()) { - auto *OverflowOp = - B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS); - Checks.push_back(B.CreateExtractValue(OverflowOp, 1)); - } - break; - } - case Instruction::UDiv: { - if (I.isExact()) { - auto *Check = - B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS), - ConstantInt::get(LHS->getType(), 0)); - Checks.push_back(Check); - } - break; - } - case Instruction::SDiv: { - if (I.isExact()) { - auto *Check = - B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS), - ConstantInt::get(LHS->getType(), 0)); - Checks.push_back(Check); - } - break; - } - case Instruction::AShr: - case Instruction::LShr: - case Instruction::Shl: { - Value *ShiftCheck = - B.CreateICmp(ICmpInst::ICMP_UGE, RHS, - ConstantInt::get(RHS->getType(), - LHS->getType()->getScalarSizeInBits())); - Checks.push_back(ShiftCheck); - break; - } - }; -} - -/// Given an instruction which can produce poison on non-poison inputs -/// (i.e. canCreatePoison returns true), generate runtime checks to produce -/// boolean indicators of when poison would result. -static void generateCreationChecks(Instruction &I, - SmallVectorImpl<Value*> &Checks) { - IRBuilder<> B(&I); - if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy()) - generateCreationChecksForBinOp(I, Checks); - - // Handle non-binops separately - switch (I.getOpcode()) { - default: - // Note there are a couple of missing cases here, once implemented, this - // should become an llvm_unreachable. 
- break; - case Instruction::ExtractElement: { - Value *Vec = I.getOperand(0); - auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType()); - if (!VecVTy) - break; - Value *Idx = I.getOperand(1); - unsigned NumElts = VecVTy->getNumElements(); - Value *Check = - B.CreateICmp(ICmpInst::ICMP_UGE, Idx, - ConstantInt::get(Idx->getType(), NumElts)); - Checks.push_back(Check); - break; - } - case Instruction::InsertElement: { - Value *Vec = I.getOperand(0); - auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType()); - if (!VecVTy) - break; - Value *Idx = I.getOperand(2); - unsigned NumElts = VecVTy->getNumElements(); - Value *Check = - B.CreateICmp(ICmpInst::ICMP_UGE, Idx, - ConstantInt::get(Idx->getType(), NumElts)); - Checks.push_back(Check); - break; - } - }; -} - -static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) { - auto Itr = ValToPoison.find(V); - if (Itr != ValToPoison.end()) - return Itr->second; - if (isa<Constant>(V)) { - return ConstantInt::getFalse(V->getContext()); - } - // Return false for unknwon values - this implements a non-strict mode where - // unhandled IR constructs are simply considered to never produce poison. At - // some point in the future, we probably want a "strict mode" for testing if - // nothing else. 
- return ConstantInt::getFalse(V->getContext()); -} - -static void CreateAssert(IRBuilder<> &B, Value *Cond) { - assert(Cond->getType()->isIntegerTy(1)); - if (auto *CI = dyn_cast<ConstantInt>(Cond)) - if (CI->isAllOnesValue()) - return; - - Module *M = B.GetInsertBlock()->getModule(); - M->getOrInsertFunction("__poison_checker_assert", - Type::getVoidTy(M->getContext()), - Type::getInt1Ty(M->getContext())); - Function *TrapFunc = M->getFunction("__poison_checker_assert"); - B.CreateCall(TrapFunc, Cond); -} - -static void CreateAssertNot(IRBuilder<> &B, Value *Cond) { - assert(Cond->getType()->isIntegerTy(1)); - CreateAssert(B, B.CreateNot(Cond)); -} - -static bool rewrite(Function &F) { - auto * const Int1Ty = Type::getInt1Ty(F.getContext()); - - DenseMap<Value *, Value *> ValToPoison; - - for (BasicBlock &BB : F) - for (auto I = BB.begin(); isa<PHINode>(&*I); I++) { - auto *OldPHI = cast<PHINode>(&*I); - auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues()); - for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) - NewPHI->addIncoming(UndefValue::get(Int1Ty), - OldPHI->getIncomingBlock(i)); - NewPHI->insertBefore(OldPHI); - ValToPoison[OldPHI] = NewPHI; - } - - for (BasicBlock &BB : F) - for (Instruction &I : BB) { - if (isa<PHINode>(I)) continue; - - IRBuilder<> B(cast<Instruction>(&I)); - - // Note: There are many more sources of documented UB, but this pass only - // attempts to find UB triggered by propagation of poison. 
- SmallVector<const Value *, 4> NonPoisonOps; - SmallPtrSet<const Value *, 4> SeenNonPoisonOps; - getGuaranteedNonPoisonOps(&I, NonPoisonOps); - for (const Value *Op : NonPoisonOps) - if (SeenNonPoisonOps.insert(Op).second) - CreateAssertNot(B, - getPoisonFor(ValToPoison, const_cast<Value *>(Op))); - - if (LocalCheck) - if (auto *RI = dyn_cast<ReturnInst>(&I)) - if (RI->getNumOperands() != 0) { - Value *Op = RI->getOperand(0); - CreateAssertNot(B, getPoisonFor(ValToPoison, Op)); - } - - SmallVector<Value*, 4> Checks; - for (const Use &U : I.operands()) { - if (ValToPoison.count(U) && propagatesPoison(U)) - Checks.push_back(getPoisonFor(ValToPoison, U)); - } - - if (canCreatePoison(cast<Operator>(&I))) - generateCreationChecks(I, Checks); - ValToPoison[&I] = buildOrChain(B, Checks); - } - - for (BasicBlock &BB : F) - for (auto I = BB.begin(); isa<PHINode>(&*I); I++) { - auto *OldPHI = cast<PHINode>(&*I); - if (!ValToPoison.count(OldPHI)) - continue; // skip the newly inserted phis - auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]); - for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) { - auto *OldVal = OldPHI->getIncomingValue(i); - NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal)); - } - } - return true; -} - - -PreservedAnalyses PoisonCheckingPass::run(Module &M, - ModuleAnalysisManager &AM) { - bool Changed = false; - for (auto &F : M) - Changed |= rewrite(F); - - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} - -PreservedAnalyses PoisonCheckingPass::run(Function &F, - FunctionAnalysisManager &AM) { - return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} - -/* Major TODO Items: - - Control dependent poison UB - - Strict mode - (i.e. must analyze every operand) - - Poison through memory - - Function ABIs - - Full coverage of intrinsics, etc.. 
(ouch) - - Instructions w/Unclear Semantics: - - shufflevector - It would seem reasonable for an out of bounds mask element - to produce poison, but the LangRef does not state. - - all binary ops w/vector operands - The likely interpretation would be that - any element overflowing should produce poison for the entire result, but - the LangRef does not state. - - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems - strange that only certian flags should be documented as producing poison. - - Cases of clear poison semantics not yet implemented: - - Exact flags on ashr/lshr produce poison - - NSW/NUW flags on shl produce poison - - Inbounds flag on getelementptr produce poison - - fptosi/fptoui (out of bounds input) produce poison - - Scalable vector types for insertelement/extractelement - - Floating point binary ops w/fmf nnan/noinfs flags produce poison - */ diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp index 88cb04695217..5ef6ffb58a7c 100644 --- a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp @@ -17,12 +17,16 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Demangle/Demangle.h" #include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" using namespace llvm; +const char kRtsanModuleCtorName[] = "rtsan.module_ctor"; +const char kRtsanInitName[] = "__rtsan_ensure_initialized"; + static SmallVector<Type *> getArgTypes(ArrayRef<Value *> FunctionArgs) { SmallVector<Type *> Types; for (Value *Arg : FunctionArgs) @@ -76,16 +80,22 @@ static PreservedAnalyses runSanitizeRealtimeBlocking(Function &Fn) { return rtsanPreservedCFGAnalyses(); } -RealtimeSanitizerPass::RealtimeSanitizerPass( - const RealtimeSanitizerOptions &Options) {} +PreservedAnalyses 
RealtimeSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kRtsanModuleCtorName, kRtsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); -PreservedAnalyses RealtimeSanitizerPass::run(Function &Fn, - AnalysisManager<Function> &AM) { - if (Fn.hasFnAttribute(Attribute::SanitizeRealtime)) - return runSanitizeRealtime(Fn); + for (Function &F : M) { + if (F.hasFnAttribute(Attribute::SanitizeRealtime)) + runSanitizeRealtime(F); - if (Fn.hasFnAttribute(Attribute::SanitizeRealtimeBlocking)) - return runSanitizeRealtimeBlocking(Fn); + if (F.hasFnAttribute(Attribute::SanitizeRealtimeBlocking)) + runSanitizeRealtimeBlocking(F); + } - return PreservedAnalyses::all(); + return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 22acf59c78a3..ac033d92e30d 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -1045,10 +1045,8 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, ->setCannotMerge(); // gets the PC using GET_CALLER_PC. 
} if (Options.TracePCGuard) { - auto GuardPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy), - ConstantInt::get(IntptrTy, Idx * 4)), - PtrTy); + auto GuardPtr = IRB.CreateConstInBoundsGEP2_64( + FunctionGuardArray->getValueType(), FunctionGuardArray, 0, Idx); if (Options.GatedCallbacks) { Instruction *I = &*IP; auto GateBranch = CreateGateBranch(F, FunctionGateCmp, I); diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp new file mode 100644 index 000000000000..19610958e47b --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -0,0 +1,898 @@ +//===----- TypeSanitizer.cpp - type-based-aliasing-violation detector -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of TypeSanitizer, a type-based-aliasing-violation +// detector. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/TypeSanitizer.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#include <cctype> + +using namespace llvm; + +#define DEBUG_TYPE "tysan" + +static const char *const kTysanModuleCtorName = "tysan.module_ctor"; +static const char *const kTysanInitName = "__tysan_init"; +static const char *const kTysanCheckName = "__tysan_check"; +static const char *const kTysanGVNamePrefix = "__tysan_v1_"; + +static const char *const kTysanShadowMemoryAddress = + "__tysan_shadow_memory_address"; +static const char *const kTysanAppMemMask = "__tysan_app_memory_mask"; + +static cl::opt<bool> + ClWritesAlwaysSetType("tysan-writes-always-set-type", + cl::desc("Writes always set the type"), cl::Hidden, + cl::init(false)); + +STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); + +namespace { + +/// TypeSanitizer: instrument the code in module to find type-based 
aliasing +/// violations. +struct TypeSanitizer { + TypeSanitizer(Module &M); + bool run(Function &F, const TargetLibraryInfo &TLI); + void instrumentGlobals(Module &M); + +private: + typedef SmallDenseMap<const MDNode *, GlobalVariable *, 8> + TypeDescriptorsMapTy; + typedef SmallDenseMap<const MDNode *, std::string, 8> TypeNameMapTy; + + void initializeCallbacks(Module &M); + + Instruction *getShadowBase(Function &F); + Instruction *getAppMemMask(Function &F); + + bool instrumentWithShadowUpdate(IRBuilder<> &IRB, const MDNode *TBAAMD, + Value *Ptr, uint64_t AccessSize, bool IsRead, + bool IsWrite, Value *ShadowBase, + Value *AppMemMask, bool ForceSetType, + bool SanitizeFunction, + TypeDescriptorsMapTy &TypeDescriptors, + const DataLayout &DL); + + /// Memory-related intrinsics/instructions reset the type of the destination + /// memory (including allocas and byval arguments). + bool instrumentMemInst(Value *I, Instruction *ShadowBase, + Instruction *AppMemMask, const DataLayout &DL); + + std::string getAnonymousStructIdentifier(const MDNode *MD, + TypeNameMapTy &TypeNames); + bool generateTypeDescriptor(const MDNode *MD, + TypeDescriptorsMapTy &TypeDescriptors, + TypeNameMapTy &TypeNames, Module &M); + bool generateBaseTypeDescriptor(const MDNode *MD, + TypeDescriptorsMapTy &TypeDescriptors, + TypeNameMapTy &TypeNames, Module &M); + + const Triple TargetTriple; + Regex AnonNameRegex; + Type *IntptrTy; + uint64_t PtrShift; + IntegerType *OrdTy; + + /// Callbacks to run-time library are computed in initializeCallbacks. + FunctionCallee TysanCheck; + FunctionCallee TysanCtorFunction; + + /// Callback to set types for globals.
+ Function *TysanGlobalsSetTypeFunction; +}; +} // namespace + +TypeSanitizer::TypeSanitizer(Module &M) + : TargetTriple(Triple(M.getTargetTriple())), + AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N") { + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + PtrShift = countr_zero(IntptrTy->getPrimitiveSizeInBits() / 8); + + TysanGlobalsSetTypeFunction = M.getFunction("__tysan_set_globals_types"); + initializeCallbacks(M); +} + +void TypeSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(M.getContext()); + OrdTy = IRB.getInt32Ty(); + + AttributeList Attr; + Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); + // Initialize the callbacks. + TysanCheck = + M.getOrInsertFunction(kTysanCheckName, Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read. + OrdTy, // Size of the data in bytes. + IRB.getPtrTy(), // Pointer to type descriptor. + OrdTy // Flags. + ); + + TysanCtorFunction = + M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); +} + +void TypeSanitizer::instrumentGlobals(Module &M) { + TysanGlobalsSetTypeFunction = nullptr; + + NamedMDNode *Globals = M.getNamedMetadata("llvm.tysan.globals"); + if (!Globals) + return; + + TysanGlobalsSetTypeFunction = Function::Create( + FunctionType::get(Type::getVoidTy(M.getContext()), false), + GlobalValue::InternalLinkage, "__tysan_set_globals_types", &M); + BasicBlock *BB = + BasicBlock::Create(M.getContext(), "", TysanGlobalsSetTypeFunction); + ReturnInst::Create(M.getContext(), BB); + + const DataLayout &DL = M.getDataLayout(); + Value *ShadowBase = getShadowBase(*TysanGlobalsSetTypeFunction); + Value *AppMemMask = getAppMemMask(*TysanGlobalsSetTypeFunction); + TypeDescriptorsMapTy TypeDescriptors; + TypeNameMapTy TypeNames; + + for (const auto &GMD : Globals->operands()) { + auto *GV = mdconst::dyn_extract_or_null<GlobalVariable>(GMD->getOperand(0)); + if (!GV) + continue; + const MDNode *TBAAMD = 
cast<MDNode>(GMD->getOperand(1)); + if (!generateBaseTypeDescriptor(TBAAMD, TypeDescriptors, TypeNames, M)) + continue; + + IRBuilder<> IRB( + TysanGlobalsSetTypeFunction->getEntryBlock().getTerminator()); + Type *AccessTy = GV->getValueType(); + assert(AccessTy->isSized()); + uint64_t AccessSize = DL.getTypeStoreSize(AccessTy); + instrumentWithShadowUpdate(IRB, TBAAMD, GV, AccessSize, false, false, + ShadowBase, AppMemMask, true, false, + TypeDescriptors, DL); + } + + if (TysanGlobalsSetTypeFunction) { + IRBuilder<> IRB(cast<Function>(TysanCtorFunction.getCallee()) + ->getEntryBlock() + .getTerminator()); + IRB.CreateCall(TysanGlobalsSetTypeFunction, {}); + } +} + +static const char LUT[] = "0123456789abcdef"; + +static std::string encodeName(StringRef Name) { + size_t Length = Name.size(); + std::string Output = kTysanGVNamePrefix; + Output.reserve(Output.size() + 3 * Length); + for (size_t i = 0; i < Length; ++i) { + const unsigned char c = Name[i]; + if (isalnum(c)) { + Output.push_back(c); + continue; + } + + if (c == '_') { + Output.append("__"); + continue; + } + + Output.push_back('_'); + Output.push_back(LUT[c >> 4]); + Output.push_back(LUT[c & 15]); + } + + return Output; +} + +std::string +TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD, + TypeNameMapTy &TypeNames) { + MD5 Hash; + + for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) { + const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i)); + if (!MemberNode) + return ""; + + auto TNI = TypeNames.find(MemberNode); + std::string MemberName; + if (TNI != TypeNames.end()) { + MemberName = TNI->second; + } else { + if (MemberNode->getNumOperands() < 1) + return ""; + MDString *MemberNameNode = dyn_cast<MDString>(MemberNode->getOperand(0)); + if (!MemberNameNode) + return ""; + MemberName = MemberNameNode->getString().str(); + if (MemberName.empty()) + MemberName = getAnonymousStructIdentifier(MemberNode, TypeNames); + if (MemberName.empty()) + return ""; + TypeNames[MemberNode] = 
MemberName; + } + + Hash.update(MemberName); + Hash.update("\0"); + + uint64_t Offset = + mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue(); + Hash.update(utostr(Offset)); + Hash.update("\0"); + } + + MD5::MD5Result HashResult; + Hash.final(HashResult); + return "__anonymous_" + std::string(HashResult.digest().str()); +} + +bool TypeSanitizer::generateBaseTypeDescriptor( + const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors, + TypeNameMapTy &TypeNames, Module &M) { + if (MD->getNumOperands() < 1) + return false; + + MDString *NameNode = dyn_cast<MDString>(MD->getOperand(0)); + if (!NameNode) + return false; + + std::string Name = NameNode->getString().str(); + if (Name.empty()) + Name = getAnonymousStructIdentifier(MD, TypeNames); + if (Name.empty()) + return false; + TypeNames[MD] = Name; + std::string EncodedName = encodeName(Name); + + GlobalVariable *GV = + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName)); + if (GV) { + TypeDescriptors[MD] = GV; + return true; + } + + SmallVector<std::pair<Constant *, uint64_t>> Members; + for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) { + const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i)); + if (!MemberNode) + return false; + + Constant *Member; + auto TDI = TypeDescriptors.find(MemberNode); + if (TDI != TypeDescriptors.end()) { + Member = TDI->second; + } else { + if (!generateBaseTypeDescriptor(MemberNode, TypeDescriptors, TypeNames, + M)) + return false; + + Member = TypeDescriptors[MemberNode]; + } + + uint64_t Offset = + mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue(); + + Members.push_back(std::make_pair(Member, Offset)); + } + + // The descriptor for a scalar is: + // [2, member count, [type pointer, offset]..., name] + + LLVMContext &C = MD->getContext(); + Constant *NameData = ConstantDataArray::getString(C, NameNode->getString()); + SmallVector<Type *> TDSubTys; + SmallVector<Constant *> TDSubData; + + auto PushTDSub = [&](Constant *C) { 
+ TDSubTys.push_back(C->getType()); + TDSubData.push_back(C); + }; + + PushTDSub(ConstantInt::get(IntptrTy, 2)); + PushTDSub(ConstantInt::get(IntptrTy, Members.size())); + + // Types that are in an anonymous namespace are local to this module. + // FIXME: This should really be marked by the frontend in the metadata + // instead of having us guess this from the mangled name. Moreover, the regex + // here can pick up (unlikely) names in the non-reserved namespace (because + // it needs to search into the type to pick up cases where the type in the + // anonymous namespace is a template parameter, etc.). + bool ShouldBeComdat = !AnonNameRegex.match(NameNode->getString()); + for (auto &Member : Members) { + PushTDSub(Member.first); + PushTDSub(ConstantInt::get(IntptrTy, Member.second)); + } + + PushTDSub(NameData); + + StructType *TDTy = StructType::get(C, TDSubTys); + Constant *TD = ConstantStruct::get(TDTy, TDSubData); + + GlobalVariable *TDGV = + new GlobalVariable(TDTy, true, + !ShouldBeComdat ? GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + TD, EncodedName); + M.insertGlobalVariable(TDGV); + + if (ShouldBeComdat) { + if (TargetTriple.isOSBinFormatELF()) { + Comdat *TDComdat = M.getOrInsertComdat(EncodedName); + TDGV->setComdat(TDComdat); + } + appendToUsed(M, TDGV); + } + + TypeDescriptors[MD] = TDGV; + return true; +} + +bool TypeSanitizer::generateTypeDescriptor( + const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors, + TypeNameMapTy &TypeNames, Module &M) { + // Here we need to generate a type descriptor corresponding to this TBAA + // metadata node. Under the current scheme there are three kinds of TBAA + // metadata nodes: scalar nodes, struct nodes, and struct tag nodes. + + if (MD->getNumOperands() < 3) + return false; + + const MDNode *BaseNode = dyn_cast<MDNode>(MD->getOperand(0)); + if (!BaseNode) + return false; + + // This is a struct tag (element-access) node. 
+ + const MDNode *AccessNode = dyn_cast<MDNode>(MD->getOperand(1)); + if (!AccessNode) + return false; + + Constant *Base; + auto TDI = TypeDescriptors.find(BaseNode); + if (TDI != TypeDescriptors.end()) { + Base = TDI->second; + } else { + if (!generateBaseTypeDescriptor(BaseNode, TypeDescriptors, TypeNames, M)) + return false; + + Base = TypeDescriptors[BaseNode]; + } + + Constant *Access; + TDI = TypeDescriptors.find(AccessNode); + if (TDI != TypeDescriptors.end()) { + Access = TDI->second; + } else { + if (!generateBaseTypeDescriptor(AccessNode, TypeDescriptors, TypeNames, M)) + return false; + + Access = TypeDescriptors[AccessNode]; + } + + uint64_t Offset = + mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue(); + std::string EncodedName = + std::string(Base->getName()) + "_o_" + utostr(Offset); + + GlobalVariable *GV = + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName)); + if (GV) { + TypeDescriptors[MD] = GV; + return true; + } + + // The descriptor for a scalar is: + // [1, base-type pointer, access-type pointer, offset] + + StructType *TDTy = + StructType::get(IntptrTy, Base->getType(), Access->getType(), IntptrTy); + Constant *TD = + ConstantStruct::get(TDTy, ConstantInt::get(IntptrTy, 1), Base, Access, + ConstantInt::get(IntptrTy, Offset)); + + bool ShouldBeComdat = cast<GlobalVariable>(Base)->getLinkage() == + GlobalValue::LinkOnceODRLinkage; + + GlobalVariable *TDGV = + new GlobalVariable(TDTy, true, + !ShouldBeComdat ? 
GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + TD, EncodedName); + M.insertGlobalVariable(TDGV); + + if (ShouldBeComdat) { + if (TargetTriple.isOSBinFormatELF()) { + Comdat *TDComdat = M.getOrInsertComdat(EncodedName); + TDGV->setComdat(TDComdat); + } + appendToUsed(M, TDGV); + } + + TypeDescriptors[MD] = TDGV; + return true; +} + +Instruction *TypeSanitizer::getShadowBase(Function &F) { + IRBuilder<> IRB(&F.front().front()); + Constant *GlobalShadowAddress = + F.getParent()->getOrInsertGlobal(kTysanShadowMemoryAddress, IntptrTy); + return IRB.CreateLoad(IntptrTy, GlobalShadowAddress, "shadow.base"); +} + +Instruction *TypeSanitizer::getAppMemMask(Function &F) { + IRBuilder<> IRB(&F.front().front()); + Value *GlobalAppMemMask = + F.getParent()->getOrInsertGlobal(kTysanAppMemMask, IntptrTy); + return IRB.CreateLoad(IntptrTy, GlobalAppMemMask, "app.mem.mask"); +} + +/// Collect all loads and stores, and for what TBAA nodes we need to generate +/// type descriptors. +void collectMemAccessInfo( + Function &F, const TargetLibraryInfo &TLI, + SmallVectorImpl<std::pair<Instruction *, MemoryLocation>> &MemoryAccesses, + SmallSetVector<const MDNode *, 8> &TBAAMetadata, + SmallVectorImpl<Value *> &MemTypeResetInsts) { + // Traverse all instructions, collect loads/stores/returns, check for calls. + for (Instruction &Inst : instructions(F)) { + // Skip memory accesses inserted by another instrumentation. + if (Inst.getMetadata(LLVMContext::MD_nosanitize)) + continue; + + if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) || + isa<AtomicCmpXchgInst>(Inst) || isa<AtomicRMWInst>(Inst)) { + MemoryLocation MLoc = MemoryLocation::get(&Inst); + + // Swift errors are special (we can't introduce extra uses on them). + if (MLoc.Ptr->isSwiftError()) + continue; + + // Skip non-address-space-0 pointers; we don't know how to handle them. 
+ Type *PtrTy = cast<PointerType>(MLoc.Ptr->getType()); + if (PtrTy->getPointerAddressSpace() != 0) + continue; + + if (MLoc.AATags.TBAA) + TBAAMetadata.insert(MLoc.AATags.TBAA); + MemoryAccesses.push_back(std::make_pair(&Inst, MLoc)); + } else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) { + if (CallInst *CI = dyn_cast<CallInst>(&Inst)) + maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI); + + if (isa<MemIntrinsic>(Inst)) { + MemTypeResetInsts.push_back(&Inst); + } else if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + MemTypeResetInsts.push_back(&Inst); + } + } else if (isa<AllocaInst>(Inst)) { + MemTypeResetInsts.push_back(&Inst); + } + } +} + +bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) { + // This is required to prevent instrumenting call to __tysan_init from within + // the module constructor. + if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction) + return false; + initializeCallbacks(*F.getParent()); + + // We need to collect all loads and stores, and know for what TBAA nodes we + // need to generate type descriptors. + SmallVector<std::pair<Instruction *, MemoryLocation>> MemoryAccesses; + SmallSetVector<const MDNode *, 8> TBAAMetadata; + SmallVector<Value *> MemTypeResetInsts; + collectMemAccessInfo(F, TLI, MemoryAccesses, TBAAMetadata, MemTypeResetInsts); + + // byval arguments also need their types reset (they're new stack memory, + // just like allocas). + for (auto &A : F.args()) + if (A.hasByValAttr()) + MemTypeResetInsts.push_back(&A); + + Module &M = *F.getParent(); + TypeDescriptorsMapTy TypeDescriptors; + TypeNameMapTy TypeNames; + bool Res = false; + for (const MDNode *MD : TBAAMetadata) { + if (TypeDescriptors.count(MD)) + continue; + + if (!generateTypeDescriptor(MD, TypeDescriptors, TypeNames, M)) + return Res; // Giving up. 
+ + Res = true; + } + + const DataLayout &DL = F.getParent()->getDataLayout(); + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeType); + bool NeedsInstrumentation = + MemTypeResetInsts.empty() && MemoryAccesses.empty(); + Instruction *ShadowBase = NeedsInstrumentation ? nullptr : getShadowBase(F); + Instruction *AppMemMask = NeedsInstrumentation ? nullptr : getAppMemMask(F); + for (const auto &[I, MLoc] : MemoryAccesses) { + IRBuilder<> IRB(I); + assert(MLoc.Size.isPrecise()); + if (instrumentWithShadowUpdate( + IRB, MLoc.AATags.TBAA, const_cast<Value *>(MLoc.Ptr), + MLoc.Size.getValue(), I->mayReadFromMemory(), I->mayWriteToMemory(), + ShadowBase, AppMemMask, false, SanitizeFunction, TypeDescriptors, + DL)) { + ++NumInstrumentedAccesses; + Res = true; + } + } + + for (auto Inst : MemTypeResetInsts) + Res |= instrumentMemInst(Inst, ShadowBase, AppMemMask, DL); + + return Res; +} + +static Value *convertToShadowDataInt(IRBuilder<> &IRB, Value *Ptr, + Type *IntptrTy, uint64_t PtrShift, + Value *ShadowBase, Value *AppMemMask) { + return IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Ptr, IntptrTy, "app.ptr.int"), + AppMemMask, "app.ptr.masked"), + PtrShift, "app.ptr.shifted"), + ShadowBase, "shadow.ptr.int"); +} + +bool TypeSanitizer::instrumentWithShadowUpdate( + IRBuilder<> &IRB, const MDNode *TBAAMD, Value *Ptr, uint64_t AccessSize, + bool IsRead, bool IsWrite, Value *ShadowBase, Value *AppMemMask, + bool ForceSetType, bool SanitizeFunction, + TypeDescriptorsMapTy &TypeDescriptors, const DataLayout &DL) { + Constant *TDGV; + if (TBAAMD) + TDGV = TypeDescriptors[TBAAMD]; + else + TDGV = Constant::getNullValue(IRB.getPtrTy()); + + Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, + ShadowBase, AppMemMask); + Type *Int8PtrPtrTy = PointerType::get(IRB.getPtrTy(), 0); + Value *ShadowData = + IRB.CreateIntToPtr(ShadowDataInt, Int8PtrPtrTy, "shadow.ptr"); + 
+ auto SetType = [&]() { + IRB.CreateStore(TD, ShadowData); + + // Now fill the remainder of the shadow memory corresponding to the + // remainder of the bytes of the type with a bad type descriptor. + for (uint64_t i = 1; i < AccessSize; ++i) { + Value *BadShadowData = IRB.CreateIntToPtr( + IRB.CreateAdd(ShadowDataInt, + ConstantInt::get(IntptrTy, i << PtrShift), + "shadow.byte." + Twine(i) + ".offset"), + Int8PtrPtrTy, "shadow.byte." + Twine(i) + ".ptr"); + + // This is the TD value, -i, which is used to indicate that the byte is + // i bytes after the first byte of the type. + Value *BadTD = + IRB.CreateIntToPtr(ConstantInt::getSigned(IntptrTy, -i), + IRB.getPtrTy(), "bad.descriptor" + Twine(i)); + IRB.CreateStore(BadTD, BadShadowData); + } + }; + + if (ForceSetType || (ClWritesAlwaysSetType && IsWrite)) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. + SetType(); + return true; + } + + assert((!ClWritesAlwaysSetType || IsRead) && + "should have handled case above"); + LLVMContext &C = IRB.getContext(); + MDNode *UnlikelyBW = MDBuilder(C).createBranchWeights(1, 100000); + + if (!SanitizeFunction) { + // If we're not sanitizing this function, then we only care whether we + // need to *set* the type. + Value *LoadedTD = IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc"); + Value *NullTDCmp = IRB.CreateIsNull(LoadedTD, "desc.set"); + Instruction *NullTDTerm = SplitBlockAndInsertIfThen( + NullTDCmp, &*IRB.GetInsertPoint(), false, UnlikelyBW); + IRB.SetInsertPoint(NullTDTerm); + NullTDTerm->getParent()->setName("set.type"); + SetType(); + return true; + } + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked. If the type + // doesn't match, then we call the runtime (which may yet determine that + // the mismatch is okay). + // + // The checks generated below have the following structure.
+ // + // ; First we load the descriptor for the load from shadow memory and + // ; compare it against the type descriptor for the current access type. + // %shadow.desc = load ptr %shadow.data + // %bad.desc = icmp ne %shadow.desc, %td + // br %bad.desc, %bad.bb, %good.bb + // + // bad.bb: + // %shadow.desc.null = icmp eq %shadow.desc, null + // br %shadow.desc.null, %null.td.bb, %good.td.bb + // + // null.td.bb: + // ; The type is unknown, set it if all bytes in the value are also unknown. + // ; To check, we load the shadow data for all bytes of the access. For the + // ; pseudo code below, assume an access of size 1. + // %shadow.data.int = add %shadow.data.int, 0 + // %l = load (inttoptr %shadow.data.int) + // %is.not.null = icmp ne %l, null + // %not.all.unknown = %is.not.null + // br %not.all.unknown, before.set.type.bb + // + // before.set.type.bb: + // ; Call runtime to check mismatch. + // call void @__tysan_check() + // br %set.type.bb + // + // set.type.bb: + // ; Now fill the remainder of the shadow memory corresponding to the + // ; remainder of the bytes of the type with a bad type descriptor. + // store %TD, %shadow.data + // br %continue.bb + // + // good.td.bb: + // ; We have a non-trivial mismatch. Call the runtime. + // call void @__tysan_check() + // br %continue.bb + // + // good.bb: + // ; We appear to have the right type. Make sure that all other bytes in + // ; the type are still marked as interior bytes. If not, call the runtime.
+ // %shadow.data.int = add %shadow.data.int, 0 + // %l = load (inttoptr %shadow.data.int) + // %not.all.interior = icmp sge %l, 0 + // br %not.all.interior, label %check.rt.bb, label %continue.bb + // + // check.rt.bb: + // call void @__tysan_check() + // br %continue.bb + + Constant *Flags = ConstantInt::get(OrdTy, int(IsRead) | (int(IsWrite) << 1)); + + Value *LoadedTD = IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc"); + Value *BadTDCmp = IRB.CreateICmpNE(LoadedTD, TD, "bad.desc"); + Instruction *BadTDTerm, *GoodTDTerm; + SplitBlockAndInsertIfThenElse(BadTDCmp, &*IRB.GetInsertPoint(), &BadTDTerm, + &GoodTDTerm, UnlikelyBW); + IRB.SetInsertPoint(BadTDTerm); + + // We now know that the types did not match (we're on the slow path). If + // the type is unknown, then set it. + Value *NullTDCmp = IRB.CreateIsNull(LoadedTD); + Instruction *NullTDTerm, *MismatchTerm; + SplitBlockAndInsertIfThenElse(NullTDCmp, &*IRB.GetInsertPoint(), &NullTDTerm, + &MismatchTerm); + + // If the type is unknown, then set the type. + IRB.SetInsertPoint(NullTDTerm); + + // We're about to set the type. Make sure that all bytes in the value are + // also of unknown type. + Value *Size = ConstantInt::get(OrdTy, AccessSize); + Value *NotAllUnkTD = IRB.getFalse(); + for (uint64_t i = 1; i < AccessSize; ++i) { + Value *UnkShadowData = IRB.CreateIntToPtr( + IRB.CreateAdd(ShadowDataInt, ConstantInt::get(IntptrTy, i << PtrShift)), + Int8PtrPtrTy); + Value *ILdTD = IRB.CreateLoad(IRB.getPtrTy(), UnkShadowData); + NotAllUnkTD = IRB.CreateOr(NotAllUnkTD, IRB.CreateIsNotNull(ILdTD)); + } + + Instruction *BeforeSetType = &*IRB.GetInsertPoint(); + Instruction *BadUTDTerm = + SplitBlockAndInsertIfThen(NotAllUnkTD, BeforeSetType, false, UnlikelyBW); + IRB.SetInsertPoint(BadUTDTerm); + IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size, + (Value *)TD, (Value *)Flags}); + + IRB.SetInsertPoint(BeforeSetType); + SetType(); + + // We have a non-trivial mismatch. Call the runtime. 
+ IRB.SetInsertPoint(MismatchTerm); + IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size, + (Value *)TD, (Value *)Flags}); + + // We appear to have the right type. Make sure that all other bytes in + // the type are still marked as interior bytes. If not, call the runtime. + IRB.SetInsertPoint(GoodTDTerm); + Value *NotAllBadTD = IRB.getFalse(); + for (uint64_t i = 1; i < AccessSize; ++i) { + Value *BadShadowData = IRB.CreateIntToPtr( + IRB.CreateAdd(ShadowDataInt, ConstantInt::get(IntptrTy, i << PtrShift)), + Int8PtrPtrTy); + Value *ILdTD = IRB.CreatePtrToInt( + IRB.CreateLoad(IRB.getPtrTy(), BadShadowData), IntptrTy); + NotAllBadTD = IRB.CreateOr( + NotAllBadTD, IRB.CreateICmpSGE(ILdTD, ConstantInt::get(IntptrTy, 0))); + } + + Instruction *BadITDTerm = SplitBlockAndInsertIfThen( + NotAllBadTD, &*IRB.GetInsertPoint(), false, UnlikelyBW); + IRB.SetInsertPoint(BadITDTerm); + IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size, + (Value *)TD, (Value *)Flags}); + return true; +} + +bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, + Instruction *AppMemMask, + const DataLayout &DL) { + BasicBlock::iterator IP; + BasicBlock *BB; + Function *F; + + if (auto *I = dyn_cast<Instruction>(V)) { + IP = BasicBlock::iterator(I); + BB = I->getParent(); + F = BB->getParent(); + } else { + auto *A = cast<Argument>(V); + F = A->getParent(); + BB = &F->getEntryBlock(); + IP = BB->getFirstInsertionPt(); + + // Find the next insert point after both ShadowBase and AppMemMask. 
+ if (IP->comesBefore(ShadowBase)) + IP = ShadowBase->getNextNode()->getIterator(); + if (IP->comesBefore(AppMemMask)) + IP = AppMemMask->getNextNode()->getIterator(); + } + + Value *Dest, *Size, *Src = nullptr; + bool NeedsMemMove = false; + IRBuilder<> IRB(BB, IP); + + if (auto *A = dyn_cast<Argument>(V)) { + assert(A->hasByValAttr() && "Type reset for non-byval argument?"); + + Dest = A; + Size = + ConstantInt::get(IntptrTy, DL.getTypeAllocSize(A->getParamByValType())); + } else { + auto *I = cast<Instruction>(V); + if (auto *MI = dyn_cast<MemIntrinsic>(I)) { + if (MI->getDestAddressSpace() != 0) + return false; + + Dest = MI->getDest(); + Size = MI->getLength(); + + if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { + if (MTI->getSourceAddressSpace() == 0) { + Src = MTI->getSource(); + NeedsMemMove = isa<MemMoveInst>(MTI); + } + } + } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + + Size = II->getArgOperand(0); + Dest = II->getArgOperand(1); + } else if (auto *AI = dyn_cast<AllocaInst>(I)) { + // We need to clear the types for new stack allocations (or else we might + // read stale type information from a previous function execution). 
+ + IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I))); + IRB.SetInstDebugLocation(I); + + Size = IRB.CreateMul( + IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), + ConstantInt::get(IntptrTy, + DL.getTypeAllocSize(AI->getAllocatedType()))); + Dest = I; + } else { + return false; + } + } + + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), + Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } + + return true; +} + +PreservedAnalyses TypeSanitizerPass::run(Function &F, + FunctionAnalysisManager &FAM) { + TypeSanitizer TySan(*F.getParent()); + TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F)); + return PreservedAnalyses::none(); +} + +PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, + ModuleAnalysisManager &AM) { + Function *TysanCtorFunction; + std::tie(TysanCtorFunction, std::ignore) = + createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName, + kTysanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}); + + TypeSanitizer TySan(M); + TySan.instrumentGlobals(M); + appendToGlobalCtors(M, TysanCtorFunction, 0); + return PreservedAnalyses::none(); +} diff --git 
a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index b8571ba07489..bbc7a005b9ff 100644 --- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -132,7 +132,7 @@ static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To, if (!BI || !BI->isConditional()) return; - CmpInst::Predicate Pred; + CmpPredicate Pred; Value *Cond = BI->getCondition(); if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant()))) return; @@ -142,7 +142,7 @@ static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To, if (isCondRelevantToAnyCallArgument(Cmp, CB)) Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To ? Pred - : Cmp->getInversePredicate()}); + : Cmp->getInverseCmpPredicate()}); } /// Record ICmp conditions relevant to any argument in CB following Pred's diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 8d1e793836c7..91a3c3f0d392 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -88,13 +88,12 @@ static Instruction *getContextInstForUse(Use &U) { namespace { /// Struct to express a condition of the form %Op0 Pred %Op1. 
struct ConditionTy { - CmpInst::Predicate Pred; - Value *Op0; - Value *Op1; + CmpPredicate Pred; + Value *Op0 = nullptr; + Value *Op1 = nullptr; - ConditionTy() - : Pred(CmpInst::BAD_ICMP_PREDICATE), Op0(nullptr), Op1(nullptr) {} - ConditionTy(CmpInst::Predicate Pred, Value *Op0, Value *Op1) + ConditionTy() = default; + ConditionTy(CmpPredicate Pred, Value *Op0, Value *Op1) : Pred(Pred), Op0(Op0), Op1(Op1) {} }; @@ -132,18 +131,17 @@ struct FactOrCheck { Ty(Ty) {} FactOrCheck(DomTreeNode *DTN, Use *U) - : U(U), DoesHold(CmpInst::BAD_ICMP_PREDICATE, nullptr, nullptr), - NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), + : U(U), NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), Ty(EntryTy::UseCheck) {} - FactOrCheck(DomTreeNode *DTN, CmpInst::Predicate Pred, Value *Op0, Value *Op1, - ConditionTy Precond = ConditionTy()) + FactOrCheck(DomTreeNode *DTN, CmpPredicate Pred, Value *Op0, Value *Op1, + ConditionTy Precond = {}) : Cond(Pred, Op0, Op1), DoesHold(Precond), NumIn(DTN->getDFSNumIn()), NumOut(DTN->getDFSNumOut()), Ty(EntryTy::ConditionFact) {} - static FactOrCheck getConditionFact(DomTreeNode *DTN, CmpInst::Predicate Pred, + static FactOrCheck getConditionFact(DomTreeNode *DTN, CmpPredicate Pred, Value *Op0, Value *Op1, - ConditionTy Precond = ConditionTy()) { + ConditionTy Precond = {}) { return FactOrCheck(DTN, Pred, Op0, Op1, Precond); } @@ -218,7 +216,7 @@ struct StackEntry { StackEntry(unsigned NumIn, unsigned NumOut, bool IsSigned, SmallVector<Value *, 2> ValuesToRelease) : NumIn(NumIn), NumOut(NumOut), IsSigned(IsSigned), - ValuesToRelease(ValuesToRelease) {} + ValuesToRelease(std::move(ValuesToRelease)) {} }; struct ConstraintTy { @@ -521,11 +519,21 @@ static Decomposition decompose(Value *V, else if (match(V, m_NNegZExt(m_Value(Op0)))) { V = Op0; IsKnownNonNegative = true; + } else if (match(V, m_NSWTrunc(m_Value(Op0)))) { + if (Op0->getType()->getScalarSizeInBits() <= 64) + V = Op0; } if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) 
return MergeResults(Op0, Op1, IsSigned); + if (match(V, m_NSWSub(m_Value(Op0), m_Value(Op1)))) { + auto ResA = decompose(Op0, Preconditions, IsSigned, DL); + auto ResB = decompose(Op1, Preconditions, IsSigned, DL); + ResA.sub(ResB); + return ResA; + } + ConstantInt *CI; if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) { auto Result = decompose(Op0, Preconditions, IsSigned, DL); @@ -558,12 +566,19 @@ static Decomposition decompose(Value *V, if (match(V, m_ZExt(m_Value(Op0)))) { IsKnownNonNegative = true; V = Op0; - } - - if (match(V, m_SExt(m_Value(Op0)))) { + } else if (match(V, m_SExt(m_Value(Op0)))) { V = Op0; Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, ConstantInt::get(Op0->getType(), 0)); + } else if (auto *Trunc = dyn_cast<TruncInst>(V)) { + if (Trunc->getSrcTy()->getScalarSizeInBits() <= 64) { + if (Trunc->hasNoUnsignedWrap() || Trunc->hasNoSignedWrap()) { + V = Trunc->getOperand(0); + if (!Trunc->hasNoUnsignedWrap()) + Preconditions.emplace_back(CmpInst::ICMP_SGE, V, + ConstantInt::get(V->getType(), 0)); + } + } } Value *Op1; @@ -711,8 +726,8 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, } for (const auto &KV : VariablesB) { - if (SubOverflow(R[GetOrAddIndex(KV.Variable)], KV.Coefficient, - R[GetOrAddIndex(KV.Variable)])) + auto &Coeff = R[GetOrAddIndex(KV.Variable)]; + if (SubOverflow(Coeff, KV.Coefficient, Coeff)) return {}; auto I = KnownNonNegativeVariables.insert({KV.Variable, KV.IsKnownNonNegative}); @@ -744,9 +759,9 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, if (!KV.second || (!Value2Index.contains(KV.first) && !NewIndexMap.contains(KV.first))) continue; - SmallVector<int64_t, 8> C(Value2Index.size() + NewVariables.size() + 1, 0); + auto &C = Res.ExtraInfo.emplace_back( + Value2Index.size() + NewVariables.size() + 1, 0); C[GetOrAddIndex(KV.first)] = -1; - Res.ExtraInfo.push_back(C); } return Res; } @@ -912,7 +927,7 @@ void 
State::addInfoForInductions(BasicBlock &BB) { Value *A; Value *B; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(BB.getTerminator(), m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value()))) @@ -1079,7 +1094,7 @@ void State::addInfoFor(BasicBlock &BB) { switch (ID) { case Intrinsic::assume: { Value *A, *B; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(I.getOperand(0), m_ICmp(Pred, m_Value(A), m_Value(B)))) break; if (GuaranteedToExecute) { @@ -1166,8 +1181,7 @@ void State::addInfoFor(BasicBlock &BB) { if (auto *Cmp = dyn_cast<ICmpInst>(Cur)) { WorkList.emplace_back(FactOrCheck::getConditionFact( DT.getNode(Successor), - IsOr ? CmpInst::getInversePredicate(Cmp->getPredicate()) - : Cmp->getPredicate(), + IsOr ? Cmp->getInverseCmpPredicate() : Cmp->getCmpPredicate(), Cmp->getOperand(0), Cmp->getOperand(1))); continue; } @@ -1191,13 +1205,12 @@ void State::addInfoFor(BasicBlock &BB) { return; if (canAddSuccessor(BB, Br->getSuccessor(0))) WorkList.emplace_back(FactOrCheck::getConditionFact( - DT.getNode(Br->getSuccessor(0)), CmpI->getPredicate(), + DT.getNode(Br->getSuccessor(0)), CmpI->getCmpPredicate(), CmpI->getOperand(0), CmpI->getOperand(1))); if (canAddSuccessor(BB, Br->getSuccessor(1))) WorkList.emplace_back(FactOrCheck::getConditionFact( - DT.getNode(Br->getSuccessor(1)), - CmpInst::getInversePredicate(CmpI->getPredicate()), CmpI->getOperand(0), - CmpI->getOperand(1))); + DT.getNode(Br->getSuccessor(1)), CmpI->getInverseCmpPredicate(), + CmpI->getOperand(0), CmpI->getOperand(1))); } #ifndef NDEBUG @@ -1527,7 +1540,7 @@ static bool checkOrAndOpImpliedByOther( while (!Worklist.empty()) { Value *Val = Worklist.pop_back_val(); Value *LHS, *RHS; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (match(Val, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) { // For OR, check if the negated condition implies CmpToCheck. 
if (IsOr) @@ -1578,53 +1591,52 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, LLVM_DEBUG(dbgs() << "Adding '"; dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "'\n"); - bool Added = false; auto &CSToUse = getCS(R.IsSigned); if (R.Coefficients.empty()) return; - Added |= CSToUse.addVariableRowFill(R.Coefficients); + bool Added = CSToUse.addVariableRowFill(R.Coefficients); + if (!Added) + return; // If R has been added to the system, add the new variables and queue it for // removal once it goes out-of-scope. - if (Added) { - SmallVector<Value *, 2> ValuesToRelease; - auto &Value2Index = getValue2Index(R.IsSigned); - for (Value *V : NewVariables) { - Value2Index.insert({V, Value2Index.size() + 1}); - ValuesToRelease.push_back(V); - } - - LLVM_DEBUG({ - dbgs() << " constraint: "; - dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned)); - dbgs() << "\n"; - }); + SmallVector<Value *, 2> ValuesToRelease; + auto &Value2Index = getValue2Index(R.IsSigned); + for (Value *V : NewVariables) { + Value2Index.insert({V, Value2Index.size() + 1}); + ValuesToRelease.push_back(V); + } - DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, - std::move(ValuesToRelease)); - - if (!R.IsSigned) { - for (Value *V : NewVariables) { - ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), - false, false, false); - VarPos.Coefficients[Value2Index[V]] = -1; - CSToUse.addVariableRow(VarPos.Coefficients); - DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, - SmallVector<Value *, 2>()); - } - } + LLVM_DEBUG({ + dbgs() << " constraint: "; + dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned)); + dbgs() << "\n"; + }); - if (R.isEq()) { - // Also add the inverted constraint for equality constraints. 
- for (auto &Coeff : R.Coefficients) - Coeff *= -1; - CSToUse.addVariableRowFill(R.Coefficients); + DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, + std::move(ValuesToRelease)); + if (!R.IsSigned) { + for (Value *V : NewVariables) { + ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), + false, false, false); + VarPos.Coefficients[Value2Index[V]] = -1; + CSToUse.addVariableRow(VarPos.Coefficients); DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, SmallVector<Value *, 2>()); } } + + if (R.isEq()) { + // Also add the inverted constraint for equality constraints. + for (auto &Coeff : R.Coefficients) + Coeff *= -1; + CSToUse.addVariableRowFill(R.Coefficients); + + DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, + SmallVector<Value *, 2>()); + } } static bool replaceSubOverflowUses(IntrinsicInst *II, Value *A, Value *B, @@ -1796,7 +1808,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, continue; } - auto AddFact = [&](CmpInst::Predicate Pred, Value *A, Value *B) { + auto AddFact = [&](CmpPredicate Pred, Value *A, Value *B) { LLVM_DEBUG(dbgs() << "Processing fact to add to the system: "; dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "\n"); if (Info.getCS(CmpInst::isSigned(Pred)).size() > MaxRows) { @@ -1810,7 +1822,18 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, if (ReproducerModule && DFSInStack.size() > ReproducerCondStack.size()) ReproducerCondStack.emplace_back(Pred, A, B); - Info.transferToOtherSystem(Pred, A, B, CB.NumIn, CB.NumOut, DFSInStack); + if (ICmpInst::isRelational(Pred)) { + // If samesign is present on the ICmp, simply flip the sign of the + // predicate, transferring the information from the signed system to the + // unsigned system, and viceversa. 
+ if (Pred.hasSameSign()) + Info.addFact(ICmpInst::getFlippedSignednessPredicate(Pred), A, B, + CB.NumIn, CB.NumOut, DFSInStack); + else + Info.transferToOtherSystem(Pred, A, B, CB.NumIn, CB.NumOut, + DFSInStack); + } + if (ReproducerModule && DFSInStack.size() > ReproducerCondStack.size()) { // Add dummy entries to ReproducerCondStack to keep it in sync with // DFSInStack. @@ -1823,7 +1846,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } }; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!CB.isConditionFact()) { Value *X; if (match(CB.Inst, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) { diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 3c4a40fab3e0..8a5c506eed69 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -109,7 +109,7 @@ static cl::opt<unsigned> MaxNumVisitiedPaths( "dfa-max-num-visited-paths", cl::desc( "Max number of blocks visited while enumerating paths around a switch"), - cl::Hidden, cl::init(2000)); + cl::Hidden, cl::init(2500)); static cl::opt<unsigned> MaxNumPaths("dfa-max-num-paths", @@ -754,17 +754,15 @@ private: return Res; } - /// Walk the use-def chain and collect all the state-defining instructions. - /// - /// Return an empty map if unpredictable values encountered inside the basic - /// blocks of \p LoopPaths. + /// Walk the use-def chain and collect all the state-defining blocks and the + /// PHI nodes in those blocks that define the state. 
StateDefMap getStateDefMap() const { StateDefMap Res; - Value *FirstDef = Switch->getOperand(0); - assert(isa<PHINode>(FirstDef) && "The first definition must be a phi."); + PHINode *FirstDef = dyn_cast<PHINode>(Switch->getOperand(0)); + assert(FirstDef && "The first definition must be a phi."); SmallVector<PHINode *, 8> Stack; - Stack.push_back(dyn_cast<PHINode>(FirstDef)); + Stack.push_back(FirstDef); SmallSet<Value *, 16> SeenValues; while (!Stack.empty()) { @@ -774,18 +772,15 @@ private: SeenValues.insert(CurPhi); for (BasicBlock *IncomingBB : CurPhi->blocks()) { - Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB); + PHINode *IncomingPhi = + dyn_cast<PHINode>(CurPhi->getIncomingValueForBlock(IncomingBB)); + if (!IncomingPhi) + continue; bool IsOutsideLoops = !SwitchOuterLoop->contains(IncomingBB); - if (Incoming == FirstDef || isa<ConstantInt>(Incoming) || - SeenValues.contains(Incoming) || IsOutsideLoops) { + if (SeenValues.contains(IncomingPhi) || IsOutsideLoops) continue; - } - - // Any unpredictable value inside the loops means we must bail out. 
- if (!isa<PHINode>(Incoming)) - return StateDefMap(); - Stack.push_back(cast<PHINode>(Incoming)); + Stack.push_back(IncomingPhi); } } diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 5555b5e29cc7..cae5b9c41a37 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2054,7 +2054,7 @@ struct DSEState { return false; Instruction *ICmpL; - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(BI->getCondition(), m_c_ICmp(Pred, m_CombineAnd(m_Load(m_Specific(StorePtr)), diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index cd4846e00603..3a0ae6b01a11 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -192,7 +192,7 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A, // mechanism that may remove flags to increase the likelihood of CSE. Flavor = SPF_UNKNOWN; - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) { // Check for commuted variants of min/max by swapping predicate. @@ -279,7 +279,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { // Hash general selects to allow matching commuted true/false operands. // If we do not have a compare as the condition, just hash in the condition. 
- CmpInst::Predicate Pred; + CmpPredicate Pred; Value *X, *Y; if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y)))) return hash_combine(Inst->getOpcode(), Cond, A, B); @@ -290,7 +290,8 @@ static unsigned getHashValueImpl(SimpleValue Val) { Pred = CmpInst::getInversePredicate(Pred); std::swap(A, B); } - return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B); + return hash_combine(Inst->getOpcode(), + static_cast<CmpInst::Predicate>(Pred), X, Y, A, B); } if (CastInst *CI = dyn_cast<CastInst>(Inst)) @@ -451,7 +452,7 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) { // this code, as we simplify the double-negation before hashing the second // select (and so still succeed at CSEing them). if (LHSA == RHSB && LHSB == RHSA) { - CmpInst::Predicate PredL, PredR; + CmpPredicate PredL, PredR; Value *X, *Y; if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) && match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) && diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index a8fda0c6ab9c..2978b7990a6e 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -727,7 +727,7 @@ GuardWideningImpl::mergeChecks(SmallVectorImpl<Value *> &ChecksToHoist, // L >u C0 && L >u C1 -> L >u max(C0, C1) ConstantInt *RHS0, *RHS1; Value *LHS; - ICmpInst::Predicate Pred0, Pred1; + CmpPredicate Pred0, Pred1; // TODO: Support searching for pairs to merge from both whole lists of // ChecksToHoist and ChecksToWiden. 
if (ChecksToWiden.size() == 1 && ChecksToHoist.size() == 1 && diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 0bc783412595..e706a6f83b1e 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -105,8 +105,8 @@ static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden, static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks", cl::Hidden, cl::init(false)); -static cl::opt<unsigned> MinRuntimeIterations("irce-min-runtime-iterations", - cl::Hidden, cl::init(10)); +static cl::opt<unsigned> MinEliminatedChecks("irce-min-eliminated-checks", + cl::Hidden, cl::init(10)); static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch", cl::Hidden, cl::init(true)); @@ -130,15 +130,9 @@ static cl::opt<bool> namespace { -/// An inductive range check is conditional branch in a loop with -/// -/// 1. a very cold successor (i.e. the branch jumps to that successor very -/// rarely) -/// -/// and -/// -/// 2. a condition that is provably true for some contiguous range of values -/// taken by the containing loop's induction variable. +/// An inductive range check is conditional branch in a loop with a condition +/// that is provably true for some contiguous range of values taken by the +/// containing loop's induction variable. /// class InductiveRangeCheck { @@ -233,6 +227,7 @@ public: /// checks, and hence don't end up in \p Checks. 
static void extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, + std::optional<uint64_t> EstimatedTripCount, SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed); }; @@ -246,9 +241,10 @@ class InductiveRangeCheckElimination { std::optional<llvm::function_ref<llvm::BlockFrequencyInfo &()>>; GetBFIFunc GetBFI; - // Returns true if it is profitable to do a transform basing on estimation of - // number of iterations. - bool isProfitableToTransform(const Loop &L); + // Returns the estimated number of iterations based on block frequency info if + // available, or on branch probability info. Nullopt is returned if the number + // of iterations cannot be estimated. + std::optional<uint64_t> estimatedTripCount(const Loop &L); public: InductiveRangeCheckElimination(ScalarEvolution &SE, @@ -522,6 +518,7 @@ void InductiveRangeCheck::extractRangeChecksFromCond( void InductiveRangeCheck::extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, + std::optional<uint64_t> EstimatedTripCount, SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed) { if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; @@ -529,11 +526,32 @@ void InductiveRangeCheck::extractRangeChecksFromBranch( unsigned IndexLoopSucc = L->contains(BI->getSuccessor(0)) ? 
0 : 1; assert(L->contains(BI->getSuccessor(IndexLoopSucc)) && "No edges coming to loop?"); - BranchProbability LikelyTaken(15, 16); - if (!SkipProfitabilityChecks && BPI && - BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc) < LikelyTaken) - return; + if (!SkipProfitabilityChecks && BPI) { + auto SuccessProbability = + BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc); + if (EstimatedTripCount) { + auto EstimatedEliminatedChecks = + SuccessProbability.scale(*EstimatedTripCount); + if (EstimatedEliminatedChecks < MinEliminatedChecks) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch " + << *BI << ": " + << "estimated eliminated checks too low " + << EstimatedEliminatedChecks << "\n";); + return; + } + } else { + BranchProbability LikelyTaken(15, 16); + if (SuccessProbability < LikelyTaken) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability for branch " + << *BI << ": " + << "could not estimate trip count " + << "and branch success probability too low " + << SuccessProbability << "\n";); + return; + } + } + } // IRCE expects branch's true edge comes to loop. Invert branch for opposite // case. 
@@ -938,42 +956,34 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) { return getLoopPassPreservedAnalyses(); } -bool InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L) { - if (SkipProfitabilityChecks) - return true; +std::optional<uint64_t> +InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) { if (GetBFI) { BlockFrequencyInfo &BFI = (*GetBFI)(); uint64_t hFreq = BFI.getBlockFreq(L.getHeader()).getFrequency(); uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency(); - if (phFreq != 0 && hFreq != 0 && (hFreq / phFreq < MinRuntimeIterations)) { - LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " - << "the estimated number of iterations basing on " - "frequency info is " << (hFreq / phFreq) << "\n";); - return false; - } - return true; + if (phFreq == 0 || hFreq == 0) + return std::nullopt; + return {hFreq / phFreq}; } if (!BPI) - return true; + return std::nullopt; auto *Latch = L.getLoopLatch(); if (!Latch) - return true; + return std::nullopt; auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator()); if (!LatchBr) - return true; - auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 1 : 0; + return std::nullopt; + auto LatchBrExitIdx = LatchBr->getSuccessor(0) == L.getHeader() ? 
1 : 0; BranchProbability ExitProbability = BPI->getEdgeProbability(Latch, LatchBrExitIdx); - if (ExitProbability > BranchProbability(1, MinRuntimeIterations)) { - LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " - << "the exit probability is too big " << ExitProbability - << "\n";); - return false; - } - return true; + if (ExitProbability.isUnknown() || ExitProbability.isZero()) + return std::nullopt; + + return {ExitProbability.scaleByInverse(1)}; } bool InductiveRangeCheckElimination::run( @@ -989,8 +999,14 @@ bool InductiveRangeCheckElimination::run( return false; } - if (!isProfitableToTransform(*L)) + auto EstimatedTripCount = estimatedTripCount(*L); + if (!SkipProfitabilityChecks && EstimatedTripCount && + *EstimatedTripCount < MinEliminatedChecks) { + LLVM_DEBUG(dbgs() << "irce: could not prove profitability: " + << "the estimated number of iterations is " + << *EstimatedTripCount << "\n"); return false; + } LLVMContext &Context = Preheader->getContext(); SmallVector<InductiveRangeCheck, 16> RangeChecks; @@ -998,8 +1014,8 @@ bool InductiveRangeCheckElimination::run( for (auto *BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) - InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI, - RangeChecks, Changed); + InductiveRangeCheck::extractRangeChecksFromBranch( + TBI, L, SE, BPI, EstimatedTripCount, RangeChecks, Changed); if (RangeChecks.empty()) return Changed; diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 16110cd25bc6..300a564e222e 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -591,7 +591,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // 'getPredicateOnEdge' method. This would be able to handle value // inequalities better, for example if the compare is "X < 4" and "X < 3" // is known true but "X < 4" itself is not available. 
- CmpInst::Predicate Pred; + CmpPredicate Pred; Value *Val; Constant *Cst; if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst)))) @@ -2744,7 +2744,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // Pred is a predecessor of BB with an unconditional branch to BB. SI is // a Select instruction in Pred. BB has other predecessors and SI is used in // a PHI node in BB. SI has no other use. -// A new basic block, NewBB, is created and SI is converted to compare and +// A new basic block, NewBB, is created and SI is converted to compare and // conditional branch. SI is erased from parent. void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, SelectInst *SI, PHINode *SIUse, diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 3ade32027289..a5d5eecb1ebf 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2430,8 +2430,8 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, } else return false; - auto MatchICmpAgainstInvariant = [&](Value *C, ICmpInst::Predicate &P, - Value *&LHS, Value *&RHS) { + auto MatchICmpAgainstInvariant = [&](Value *C, CmpPredicate &P, Value *&LHS, + Value *&RHS) { if (!match(C, m_OneUse(m_ICmp(P, m_Value(LHS), m_Value(RHS))))) return false; if (!LHS->getType()->isIntegerTy()) @@ -2448,12 +2448,13 @@ static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, P = ICmpInst::getInversePredicate(P); return true; }; - ICmpInst::Predicate P1, P2; + CmpPredicate P1, P2; Value *LHS1, *LHS2, *RHS1, *RHS2; if (!MatchICmpAgainstInvariant(Cond1, P1, LHS1, RHS1) || !MatchICmpAgainstInvariant(Cond2, P2, LHS2, RHS2)) return false; - if (P1 != P2 || LHS1 != LHS2) + // FIXME: Use CmpPredicate::getMatching here. + if (P1 != static_cast<CmpInst::Predicate>(P2) || LHS1 != LHS2) return false; // Everything is fine, we can do the transform. 
@@ -2678,7 +2679,7 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, DominatorTree *DT) { using namespace PatternMatch; - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *LHS, *RHS; if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) return false; diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index ff077624802b..73f1942849ac 100644 --- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -32,7 +32,7 @@ struct ConditionInfo { /// ICmp instruction with this condition ICmpInst *ICmp = nullptr; /// Preciate info - ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + CmpPredicate Pred = ICmpInst::BAD_ICMP_PREDICATE; /// AddRec llvm value Value *AddRecValue = nullptr; /// Non PHI AddRec llvm value diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 05cf638d3f09..ba1c2241aea9 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2432,7 +2432,7 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, // Step 1: Check if the loop backedge is in desirable form. - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *CmpLHS, *CmpRHS; BasicBlock *TrueBB, *FalseBB; if (!match(LoopHeaderBB->getTerminator(), @@ -2797,7 +2797,7 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, // Step 1: Check if the loop backedge, condition is in desirable form. 
- ICmpInst::Predicate Pred; + CmpPredicate Pred; BasicBlock *TrueBB, *FalseBB; if (!match(LoopHeaderBB->getTerminator(), m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB), diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 29844c463075..796fba67ee25 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -924,8 +924,7 @@ public: match(B, m_Intrinsic<Intrinsic::matrix_transpose>( m_Value(BT), m_ConstantInt(), m_ConstantInt()))) { IRBuilder<> Builder(&I); - auto *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd")); - setShapeInfo(Add, {R, C}); + auto *Add = Builder.CreateFAdd(AT, BT, "mfadd"); MatrixBuilder MBuilder(Builder); Instruction *NewInst = MBuilder.CreateMatrixTranspose( Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t"); @@ -934,9 +933,13 @@ public: computeShapeInfoForInst(&I, ShapeMap) && "Shape of new instruction doesn't match original shape."); CleanupBinOp(I, A, B); - assert(computeShapeInfoForInst(Add, ShapeMap).value_or(ShapeMap[Add]) == - ShapeMap[Add] && - "Shape of updated addition doesn't match cached shape."); + if (auto *AddI = dyn_cast<Instruction>(Add)) { + setShapeInfo(AddI, {R, C}); + assert( + computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) == + ShapeMap[AddI] && + "Shape of updated addition doesn't match cached shape."); + } } } diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 0cba5d077da6..5f7cb92d239b 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA, static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support 
them yet - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_invariant_group, + LLVMContext::MD_access_group, LLVMContext::MD_prof, + LLVMContext::MD_memprof, LLVMContext::MD_callsite}; + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Use custom AA metadata combining handling instead of combineMetadata, which + // is meant for CSE and will drop any metadata not in the KnownIDs list. combineMetadata(ReplInst, I, KnownIDs, true); } @@ -787,43 +791,47 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. - auto *V = SI->getOperand(0); - if (Value *ByteVal = isBytewiseValue(V, DL)) { - if (Instruction *I = - tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { - BBI = I->getIterator(); // Don't invalidate iterator. - return true; - } + Value *V = SI->getOperand(0); + Value *ByteVal = isBytewiseValue(V, DL); + if (!ByteVal) + return false; - // If we have an aggregate, we try to promote it to memset regardless - // of opportunity for merging as it can expose optimization opportunities - // in subsequent passes. - auto *T = V->getType(); - if (T->isAggregateType()) { - uint64_t Size = DL.getTypeStoreSize(T); - IRBuilder<> Builder(SI); - auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, - SI->getAlign()); - M->copyMetadata(*SI, LLVMContext::MD_DIAssignID); + if (Instruction *I = + tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { + BBI = I->getIterator(); // Don't invalidate iterator. 
+ return true; + } + + // If we have an aggregate, we try to promote it to memset regardless + // of opportunity for merging as it can expose optimization opportunities + // in subsequent passes. + auto *T = V->getType(); + if (!T->isAggregateType()) + return false; - LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); + TypeSize Size = DL.getTypeStoreSize(T); + if (Size.isScalable()) + return false; - // The newly inserted memset is immediately overwritten by the original - // store, so we do not need to rename uses. - auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI)); - auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef); - MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false); + IRBuilder<> Builder(SI); + auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, + SI->getAlign()); + M->copyMetadata(*SI, LLVMContext::MD_DIAssignID); - eraseInstruction(SI); - NumMemSetInfer++; + LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); - // Make sure we do not invalidate the iterator. - BBI = M->getIterator(); - return true; - } - } + // The newly inserted memset is immediately overwritten by the original + // store, so we do not need to rename uses. + auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI)); + auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef); + MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false); - return false; + eraseInstruction(SI); + NumMemSetInfer++; + + // Make sure we do not invalidate the iterator. 
+ BBI = M->getIterator(); + return true; } bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index d80af26451ac..f6179cadab42 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -83,6 +84,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -246,6 +248,7 @@ private: bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); + bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS); std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); @@ -598,6 +601,7 @@ public: /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } + bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; } /// Support for iterating over the slices. /// @{ @@ -680,6 +684,7 @@ private: /// store a pointer to that here and abort trying to form slices of the /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; + Instruction *PointerEscapingInstrReadOnly; /// The slices of the alloca. 
/// @@ -1390,6 +1395,19 @@ private: /// Disable SROA entirely if there are unhandled users of the alloca. void visitInstruction(Instruction &I) { PI.setAborted(&I); } + + void visitCallBase(CallBase &CB) { + // If the call operand is NoCapture ReadOnly, then we mark it as + // EscapedReadOnly. + if (CB.isDataOperand(U) && + CB.doesNotCapture(U->getOperandNo()) && + CB.onlyReadsMemory(U->getOperandNo())) { + PI.setEscapedReadOnly(&CB); + return; + } + + Base::visitCallBase(CB); + } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) @@ -1397,7 +1415,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif - PointerEscapingInstr(nullptr) { + PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { @@ -1408,6 +1426,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } + PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst(); llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); }); @@ -1445,6 +1464,9 @@ void AllocaSlices::print(raw_ostream &OS) const { return; } + if (PointerEscapingInstrReadOnly) + OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n"; + OS << "Slices of alloca: " << AI << "\n"; for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); @@ -5454,6 +5476,88 @@ void SROA::clobberUse(Use &U) { } } +/// A basic LoadAndStorePromoter that does not remove store nodes. 
+class BasicLoadAndStorePromoter : public LoadAndStorePromoter { +public: + BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S, + Type *ZeroType) + : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {} + bool shouldDelete(Instruction *I) const override { + return !isa<StoreInst>(I) && !isa<AllocaInst>(I); + } + + Value *getValueToUseForAlloca(Instruction *I) const override { + return UndefValue::get(ZeroType); + } + +private: + Type *ZeroType; +}; + +bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) { + // Look through each "partition", looking for slices with the same start/end + // that do not overlap with any before them. The slices are sorted by + // increasing beginOffset. We don't use AS.partitions(), as it will use a more + // sophisticated algorithm that takes splittable slices into account. + auto PartitionBegin = AS.begin(); + auto PartitionEnd = PartitionBegin; + uint64_t BeginOffset = PartitionBegin->beginOffset(); + uint64_t EndOffset = PartitionBegin->endOffset(); + while (PartitionBegin != AS.end()) { + bool AllSameAndValid = true; + SmallVector<Instruction *> Insts; + Type *PartitionType = nullptr; + while (PartitionEnd != AS.end() && + (PartitionEnd->beginOffset() < EndOffset || + PartitionEnd->endOffset() <= EndOffset)) { + if (AllSameAndValid) { + AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset && + PartitionEnd->endOffset() == EndOffset; + Instruction *User = + cast<Instruction>(PartitionEnd->getUse()->getUser()); + if (auto *LI = dyn_cast<LoadInst>(User)) { + Type *UserTy = LI->getType(); + // LoadAndStorePromoter requires all the types to be the same. 
+ if (!LI->isSimple() || (PartitionType && UserTy != PartitionType)) + AllSameAndValid = false; + PartitionType = UserTy; + Insts.push_back(User); + } else if (auto *SI = dyn_cast<StoreInst>(User)) { + Type *UserTy = SI->getValueOperand()->getType(); + if (!SI->isSimple() || (PartitionType && UserTy != PartitionType)) + AllSameAndValid = false; + PartitionType = UserTy; + Insts.push_back(User); + } else if (!isAssumeLikeIntrinsic(User)) { + AllSameAndValid = false; + } + } + EndOffset = std::max(EndOffset, PartitionEnd->endOffset()); + ++PartitionEnd; + } + + // So long as all the slices start and end offsets matched, update loads to + // the values stored in the partition. + if (AllSameAndValid && !Insts.empty()) { + LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", " + << EndOffset << ")\n"); + SmallVector<PHINode *, 4> NewPHIs; + SSAUpdater SSA(&NewPHIs); + Insts.push_back(&AI); + BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType); + Promoter.run(Insts); + } + + // Step on to the next partition. + PartitionBegin = PartitionEnd; + if (PartitionBegin == AS.end()) + break; + BeginOffset = PartitionBegin->beginOffset(); + EndOffset = PartitionBegin->endOffset(); + } + return true; +} + /// Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds @@ -5494,6 +5598,11 @@ SROA::runOnAlloca(AllocaInst &AI) { if (AS.isEscaped()) return {Changed, CFGChanged}; + if (AS.isEscapedReadOnly()) { + Changed |= propagateStoredValuesToLoads(AI, AS); + return {Changed, CFGChanged}; + } + // Delete all the dead users of this alloca before splitting and rewriting it. for (Instruction *DeadUser : AS.getDeadUsers()) { // Free up everything used by this instruction. 
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 3b701e6ca097..2b27150112ad 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -279,8 +279,6 @@ public: bool visit(Function &F); - bool isTriviallyScalarizable(Intrinsic::ID ID); - // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. bool visitInstruction(Instruction &I) { return false; } @@ -683,19 +681,6 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) { return true; } -bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) { - if (isTriviallyVectorizable(ID)) - return true; - // TODO: Move frexp to isTriviallyVectorizable. - // https://github.com/llvm/llvm-project/issues/112408 - switch (ID) { - case Intrinsic::frexp: - return true; - } - return Intrinsic::isTargetIntrinsic(ID) && - TTI->isTargetIntrinsicTriviallyScalarizable(ID); -} - /// If a call to a vector typed intrinsic function, split into a scalar call per /// element if possible for the intrinsic. bool ScalarizerVisitor::splitCall(CallInst &CI) { @@ -715,7 +700,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { Intrinsic::ID ID = F->getIntrinsicID(); - if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID)) + if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID, TTI)) return false; // unsigned NumElems = VT->getNumElements(); @@ -743,7 +728,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { // will only scalarize when the struct elements have the same bitness. 
if (!CurrVS || CurrVS->NumPacked != VS->NumPacked) return false; - if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, I)) + if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, I, TTI)) Tys.push_back(CurrVS->SplitTy); } } @@ -794,8 +779,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { Tys[0] = VS->RemainderTy; for (unsigned J = 0; J != NumArgs; ++J) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, J) || - TTI->isTargetIntrinsicWithScalarOpAtArg(ID, J)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) { ScalarCallOps.push_back(ScalarOperands[J]); } else { ScalarCallOps.push_back(Scattered[J][I]); @@ -1089,7 +1073,7 @@ bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) { if (!F) return false; Intrinsic::ID ID = F->getIntrinsicID(); - if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID)) + if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID, TTI)) return false; // Note: Fall through means Operand is a`CallInst` and it is defined in // `isTriviallyScalarizable`. diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index d8ef450eeb9a..0712ff77151e 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2990,9 +2990,11 @@ static bool collectUnswitchCandidates( /// into its equivalent where `Pred` is something that we support for injected /// invariants (so far it is limited to ult), LHS in canonicalized form is /// non-invariant and RHS is an invariant. 
-static void canonicalizeForInvariantConditionInjection( - ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue, - BasicBlock *&IfFalse, const Loop &L) { +static void canonicalizeForInvariantConditionInjection(CmpPredicate &Pred, + Value *&LHS, Value *&RHS, + BasicBlock *&IfTrue, + BasicBlock *&IfFalse, + const Loop &L) { if (!L.contains(IfTrue)) { Pred = ICmpInst::getInversePredicate(Pred); std::swap(IfTrue, IfFalse); @@ -3235,7 +3237,7 @@ static bool collectUnswitchCandidatesWithInjections( // other). for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock()); DTN = DTN->getIDom()) { - ICmpInst::Predicate Pred; + CmpPredicate Pred; Value *LHS = nullptr, *RHS = nullptr; BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; auto *BB = DTN->getBlock(); diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 75585fcc8026..7d017095c88c 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -78,6 +78,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -93,6 +94,9 @@ using namespace PatternMatch; static const unsigned UnknownAddressSpace = std::numeric_limits<unsigned>::max(); +DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter", + "Controls whether rewriteCandidateWithBasis is executed."); + namespace { class StraightLineStrengthReduceLegacyPass : public FunctionPass { @@ -268,8 +272,8 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() { bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, const Candidate &C) { return (Basis.Ins != C.Ins && // skip the same instruction - // They must have the same type too. 
Basis.Base == C.Base doesn't - // guarantee their types are the same (PR23975). + // They must have the same type too. Basis.Base == C.Base + // doesn't guarantee their types are the same (PR23975). Basis.Ins->getType() == C.Ins->getType() && // Basis must dominate C in order to rewrite C with respect to Basis. DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) && @@ -610,6 +614,9 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis, void StraightLineStrengthReduce::rewriteCandidateWithBasis( const Candidate &C, const Candidate &Basis) { + if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter)) + return; + assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base && C.Stride == Basis.Stride); // We run rewriteCandidateWithBasis on all candidates in a post-order, so the diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 01090b54e5af..b1f742b838f2 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -686,8 +686,8 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { /// Add a dummy PHI value as soon as we knew the new predecessor void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { for (PHINode &Phi : To->phis()) { - Value *Undef = UndefValue::get(Phi.getType()); - Phi.addIncoming(Undef, From); + Value *Poison = PoisonValue::get(Phi.getType()); + Phi.addIncoming(Poison, From); } AddedPhis[To].push_back(From); } diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index e4f4052e5e48..fe1b91267c90 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1912,8 +1912,8 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, Type *IntTy = getIntTy(B, TLI); StringRef PutsName = TLI->getName(LibFunc_puts); - FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, 
LibFunc_puts, IntTy, - B.getPtrTy()); + FunctionCallee PutS = + getOrInsertLibFunc(M, *TLI, LibFunc_puts, IntTy, B.getPtrTy()); inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI); CallInst *CI = B.CreateCall(PutS, Str, PutsName); if (const Function *F = @@ -1970,9 +1970,9 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, Type *SizeTTy = getSizeTTy(B, TLI); StringRef FWriteName = TLI->getName(LibFunc_fwrite); - FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, - SizeTTy, B.getPtrTy(), SizeTTy, - SizeTTy, File->getType()); + FunctionCallee F = + getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, SizeTTy, B.getPtrTy(), + SizeTTy, SizeTTy, File->getType()); if (File->getType()->isPointerTy()) inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI); @@ -1994,8 +1994,8 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, StringRef MallocName = TLI->getName(LibFunc_malloc); Type *SizeTTy = getSizeTTy(B, TLI); - FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc, - B.getPtrTy(), SizeTTy); + FunctionCallee Malloc = + getOrInsertLibFunc(M, *TLI, LibFunc_malloc, B.getPtrTy(), SizeTTy); inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI); CallInst *CI = B.CreateCall(Malloc, Num, MallocName); @@ -2084,8 +2084,8 @@ Value *llvm::emitHotColdNew(Value *Num, IRBuilderBase &B, return nullptr; StringRef Name = TLI->getName(NewFunc); - FunctionCallee Func = M->getOrInsertFunction(Name, B.getPtrTy(), - Num->getType(), B.getInt8Ty()); + FunctionCallee Func = + M->getOrInsertFunction(Name, B.getPtrTy(), Num->getType(), B.getInt8Ty()); inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Func, {Num, B.getInt8(HotCold)}, Name); @@ -2104,9 +2104,8 @@ Value *llvm::emitHotColdNewNoThrow(Value *Num, Value *NoThrow, IRBuilderBase &B, return nullptr; StringRef Name = TLI->getName(NewFunc); - FunctionCallee Func = - M->getOrInsertFunction(Name, B.getPtrTy(), Num->getType(), - NoThrow->getType(), 
B.getInt8Ty()); + FunctionCallee Func = M->getOrInsertFunction( + Name, B.getPtrTy(), Num->getType(), NoThrow->getType(), B.getInt8Ty()); inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Func, {Num, NoThrow, B.getInt8(HotCold)}, Name); @@ -2147,8 +2146,8 @@ Value *llvm::emitHotColdNewAlignedNoThrow(Value *Num, Value *Align, StringRef Name = TLI->getName(NewFunc); FunctionCallee Func = M->getOrInsertFunction( - Name, B.getPtrTy(), Num->getType(), Align->getType(), - NoThrow->getType(), B.getInt8Ty()); + Name, B.getPtrTy(), Num->getType(), Align->getType(), NoThrow->getType(), + B.getInt8Ty()); inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Func, {Num, Align, NoThrow, B.getInt8(HotCold)}, Name); diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 17cba2e642a1..725a0eb97eae 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -692,14 +692,14 @@ bool llvm::tryPromoteCall(CallBase &CB) { if (!VTableEntryLoad) return false; // Not a vtable entry load. Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand(); - APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0); + APInt VTableOffset(DL.getIndexTypeSizeInBits(VTableEntryPtr->getType()), 0); Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets( DL, VTableOffset, /* AllowNonInbounds */ true); LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr); if (!VTablePtrLoad) return false; // Not a vtable load. 
Value *Object = VTablePtrLoad->getPointerOperand(); - APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0); + APInt ObjectOffset(DL.getIndexTypeSizeInBits(Object->getType()), 0); Value *ObjectBase = Object->stripAndAccumulateConstantOffsets( DL, ObjectOffset, /* AllowNonInbounds */ true); if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0)) @@ -710,9 +710,9 @@ bool llvm::tryPromoteCall(CallBase &CB) { BasicBlock::iterator BBI(VTablePtrLoad); Value *VTablePtr = FindAvailableLoadedValue( VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr); - if (!VTablePtr) + if (!VTablePtr || !VTablePtr->getType()->isPointerTy()) return false; // No vtable found. - APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0); + APInt VTableOffsetGVBase(DL.getIndexTypeSizeInBits(VTablePtr->getType()), 0); Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets( DL, VTableOffsetGVBase, /* AllowNonInbounds */ true); GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index cb6a4e34c226..8863dff4482a 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -43,21 +43,16 @@ using namespace llvm; /// See comments in Cloning.h. BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, - ClonedCodeInfo *CodeInfo, - DebugInfoFinder *DIFinder) { + ClonedCodeInfo *CodeInfo) { BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat; if (BB->hasName()) NewBB->setName(BB->getName() + NameSuffix); bool hasCalls = false, hasDynamicAllocas = false, hasMemProfMetadata = false; - Module *TheModule = F ? F->getParent() : nullptr; // Loop over all instructions, and copy them over. 
for (const Instruction &I : *BB) { - if (DIFinder && TheModule) - DIFinder->processInstruction(*TheModule, I); - Instruction *NewInst = I.clone(); if (I.hasName()) NewInst->setName(I.getName() + NameSuffix); @@ -157,6 +152,118 @@ DISubprogram *llvm::CollectDebugInfoForCloning(const Function &F, return SPClonedWithinModule; } +bool llvm::BuildDebugInfoMDMap(DenseMap<const Metadata *, TrackingMDRef> &MD, + CloneFunctionChangeType Changes, + DebugInfoFinder &DIFinder, + DISubprogram *SPClonedWithinModule) { + bool ModuleLevelChanges = Changes > CloneFunctionChangeType::LocalChangesOnly; + if (Changes < CloneFunctionChangeType::DifferentModule && + DIFinder.subprogram_count() > 0) { + // Turn on module-level changes, since we need to clone (some of) the + // debug info metadata. + // + // FIXME: Metadata effectively owned by a function should be made + // local, and only that local metadata should be cloned. + ModuleLevelChanges = true; + + auto mapToSelfIfNew = [&MD](MDNode *N) { + // Avoid clobbering an existing mapping. + (void)MD.try_emplace(N, N); + }; + + // Avoid cloning types, compile units, and (other) subprograms. + for (DISubprogram *ISP : DIFinder.subprograms()) { + if (ISP != SPClonedWithinModule) + mapToSelfIfNew(ISP); + } + + // If a subprogram isn't going to be cloned skip its lexical blocks as well. 
+ for (DIScope *S : DIFinder.scopes()) { + auto *LScope = dyn_cast<DILocalScope>(S); + if (LScope && LScope->getSubprogram() != SPClonedWithinModule) + mapToSelfIfNew(S); + } + + for (DICompileUnit *CU : DIFinder.compile_units()) + mapToSelfIfNew(CU); + + for (DIType *Type : DIFinder.types()) + mapToSelfIfNew(Type); + } else { + assert(!SPClonedWithinModule && + "Subprogram should be in DIFinder->subprogram_count()..."); + } + + return ModuleLevelChanges; +} + +void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc, + ValueToValueMapTy &VMap, + RemapFlags RemapFlag, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + OldFunc.getAllMetadata(MDs); + for (auto MD : MDs) { + NewFunc.addMetadata(MD.first, *MapMetadata(MD.second, VMap, RemapFlag, + TypeMapper, Materializer)); + } +} + +void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc, + ValueToValueMapTy &VMap, RemapFlags RemapFlag, + SmallVectorImpl<ReturnInst *> &Returns, + const char *NameSuffix, + ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + if (OldFunc.isDeclaration()) + return; + + // Loop over all of the basic blocks in the function, cloning them as + // appropriate. Note that we save BE this way in order to handle cloning of + // recursive functions into themselves. + for (const BasicBlock &BB : OldFunc) { + + // Create a new basic block and copy instructions into it! + BasicBlock *CBB = + CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo); + + // Add basic block mapping. + VMap[&BB] = CBB; + + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. 
(This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when + // cloning a function.) + if (BB.hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast<Function *>(&OldFunc), + const_cast<BasicBlock *>(&BB)); + VMap[OldBBAddr] = BlockAddress::get(&NewFunc, CBB); + } + + // Note return instructions for the caller. + if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator())) + Returns.push_back(RI); + } + + // Loop over all of the instructions in the new function, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (Function::iterator + BB = cast<BasicBlock>(VMap[&OldFunc.front()])->getIterator(), + BE = NewFunc.end(); + BB != BE; ++BB) + // Loop over all instructions, fixing each one as we find it, and any + // attached debug-info records. + for (Instruction &II : *BB) { + RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer); + RemapDbgRecordRange(II.getModule(), II.getDbgRecordRange(), VMap, + RemapFlag, TypeMapper, Materializer); + } +} + // Clone OldFunc into NewFunc, transforming the old arguments into references to // VMap values. void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, @@ -215,101 +322,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, DISubprogram *SPClonedWithinModule = CollectDebugInfoForCloning(*OldFunc, Changes, DIFinder); - // Loop over all of the basic blocks in the function, cloning them as - // appropriate. Note that we save BE this way in order to handle cloning of - // recursive functions into themselves. - for (const BasicBlock &BB : *OldFunc) { - - // Create a new basic block and copy instructions into it! - // NOTE: don't pass DIFinder because instructions' debug info was processed - // in ProcessSubprogramAttachment. This will be cleaned up further. 
- BasicBlock *CBB = - CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, nullptr); - - // Add basic block mapping. - VMap[&BB] = CBB; - - // It is only legal to clone a function if a block address within that - // function is never referenced outside of the function. Given that, we - // want to map block addresses from the old function to block addresses in - // the clone. (This is different from the generic ValueMapper - // implementation, which generates an invalid blockaddress when - // cloning a function.) - if (BB.hasAddressTaken()) { - Constant *OldBBAddr = BlockAddress::get(const_cast<Function *>(OldFunc), - const_cast<BasicBlock *>(&BB)); - VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); - } - - // Note return instructions for the caller. - if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator())) - Returns.push_back(RI); - } - - if (Changes < CloneFunctionChangeType::DifferentModule && - DIFinder.subprogram_count() > 0) { - // Turn on module-level changes, since we need to clone (some of) the - // debug info metadata. - // - // FIXME: Metadata effectively owned by a function should be made - // local, and only that local metadata should be cloned. - ModuleLevelChanges = true; - - auto mapToSelfIfNew = [&VMap](MDNode *N) { - // Avoid clobbering an existing mapping. - (void)VMap.MD().try_emplace(N, N); - }; - - // Avoid cloning types, compile units, and (other) subprograms. - SmallPtrSet<const DISubprogram *, 16> MappedToSelfSPs; - for (DISubprogram *ISP : DIFinder.subprograms()) { - if (ISP != SPClonedWithinModule) { - mapToSelfIfNew(ISP); - MappedToSelfSPs.insert(ISP); - } - } - - // If a subprogram isn't going to be cloned skip its lexical blocks as well. 
- for (DIScope *S : DIFinder.scopes()) { - auto *LScope = dyn_cast<DILocalScope>(S); - if (LScope && MappedToSelfSPs.count(LScope->getSubprogram())) - mapToSelfIfNew(S); - } - - for (DICompileUnit *CU : DIFinder.compile_units()) - mapToSelfIfNew(CU); - - for (DIType *Type : DIFinder.types()) - mapToSelfIfNew(Type); - } else { - assert(!SPClonedWithinModule && - "Subprogram should be in DIFinder->subprogram_count()..."); - } + ModuleLevelChanges = + BuildDebugInfoMDMap(VMap.MD(), Changes, DIFinder, SPClonedWithinModule); const auto RemapFlag = ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges; - // Duplicate the metadata that is attached to the cloned function. - // Subprograms/CUs/types that were already mapped to themselves won't be - // duplicated. - SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; - OldFunc->getAllMetadata(MDs); - for (auto MD : MDs) { - NewFunc->addMetadata(MD.first, *MapMetadata(MD.second, VMap, RemapFlag, - TypeMapper, Materializer)); - } - // Loop over all of the instructions in the new function, fixing up operand - // references as we go. This uses VMap to do all the hard work. - for (Function::iterator - BB = cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(), - BE = NewFunc->end(); - BB != BE; ++BB) - // Loop over all instructions, fixing each one as we find it, and any - // attached debug-info records. - for (Instruction &II : *BB) { - RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer); - RemapDbgRecordRange(II.getModule(), II.getDbgRecordRange(), VMap, - RemapFlag, TypeMapper, Materializer); - } + CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper, + Materializer); + + CloneFunctionBodyInto(*NewFunc, *OldFunc, VMap, RemapFlag, Returns, + NameSuffix, CodeInfo, TypeMapper, Materializer); // Only update !llvm.dbg.cu for DifferentModule (not CloneModule). In the // same module, the compile unit will already be listed (or not). 
When diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 6539f924c2ed..7ddb9e22c834 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -627,6 +627,24 @@ bool CodeExtractor::isEligible() const { return false; } } + // stacksave as input implies stackrestore in the outlined function. + // This can confuse prolog epilog insertion phase. + // stacksave's uses must not cross outlined function. + for (BasicBlock *BB : Blocks) { + for (Instruction &I : *BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + bool IsSave = II->getIntrinsicID() == Intrinsic::stacksave; + bool IsRestore = II->getIntrinsicID() == Intrinsic::stackrestore; + if (IsSave && any_of(II->users(), [&Blks = this->Blocks](User *U) { + return !definedInRegion(Blks, U); + })) + return false; + if (IsRestore && !definedInRegion(Blocks, II->getArgOperand(0))) + return false; + } + } return true; } @@ -935,6 +953,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::SanitizeMemory: case Attribute::SanitizeNumericalStability: case Attribute::SanitizeThread: + case Attribute::SanitizeType: case Attribute::SanitizeHWAddress: case Attribute::SanitizeMemTag: case Attribute::SanitizeRealtime: diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 47bb31905d1a..5b33edd51cff 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -48,6 +48,21 @@ static void insertCall(Function &CurFn, StringRef Func, /*isVarArg=*/false)), {GV}, "", InsertionPt); Call->setDebugLoc(DL); + } else if (TargetTriple.isRISCV() || TargetTriple.isAArch64() || + TargetTriple.isLoongArch()) { + // On RISC-V, AArch64, and LoongArch, the `_mcount` function takes + // `__builtin_return_address(0)` as an argument since + // 
`__builtin_return_address(1)` is not available on these platforms. + Instruction *RetAddr = CallInst::Create( + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), + ConstantInt::get(Type::getInt32Ty(C), 0), "", InsertionPt); + RetAddr->setDebugLoc(DL); + + FunctionCallee Fn = M.getOrInsertFunction( + Func, FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), + false)); + CallInst *Call = CallInst::Create(Fn, RetAddr, "", InsertionPt); + Call->setDebugLoc(DL); } else { FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C)); CallInst *Call = CallInst::Create(Fn, "", InsertionPt); diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index cf1a8b4af112..2af447aadce2 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -253,40 +253,17 @@ Evaluator::getCalleeWithFormalArgs(CallBase &CB, bool Evaluator::getFormalParams(CallBase &CB, Function *F, SmallVectorImpl<Constant *> &Formals) { - if (!F) - return false; - auto *FTy = F->getFunctionType(); - if (FTy->getNumParams() > CB.arg_size()) { - LLVM_DEBUG(dbgs() << "Too few arguments for function.\n"); + if (FTy != CB.getFunctionType()) { + LLVM_DEBUG(dbgs() << "Signature mismatch.\n"); return false; } - auto ArgI = CB.arg_begin(); - for (Type *PTy : FTy->params()) { - auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), PTy, DL); - if (!ArgC) { - LLVM_DEBUG(dbgs() << "Can not convert function argument.\n"); - return false; - } - Formals.push_back(ArgC); - ++ArgI; - } + for (Value *Arg : CB.args()) + Formals.push_back(getVal(Arg)); return true; } -/// If call expression contains bitcast then we may need to cast -/// evaluated return value to a type of the call expression. 
-Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) { - if (!RV || RV->getType() == ReturnType) - return RV; - - RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL); - if (!RV) - LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); - return RV; -} - /// Evaluate all instructions in block BB, returning true if successful, false /// if we can't evaluate it. NewBB returns the next BB that control flows into, /// or null upon return. StrippedPointerCastsForAliasAnalysis is set to true if @@ -520,9 +497,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) { - InstResult = castCallResultIfNeeded(CB.getType(), C); - if (!InstResult) - return false; + InstResult = C; LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " << *InstResult << "\n"); } else { @@ -544,10 +519,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, return false; } ValueStack.pop_back(); - InstResult = castCallResultIfNeeded(CB.getType(), RetVal); - if (RetVal && !InstResult) - return false; - + InstResult = RetVal; if (InstResult) { LLVM_DEBUG(dbgs() << "Successfully evaluated function. 
Result: " << *InstResult << "\n\n"); diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 760341a29d8c..6d4026e8209d 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -83,6 +83,13 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const { return 0; } +int FunctionComparator::cmpConstantRanges(const ConstantRange &L, + const ConstantRange &R) const { + if (int Res = cmpAPInts(L.getLower(), R.getLower())) + return Res; + return cmpAPInts(L.getUpper(), R.getUpper()); +} + int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const { // Floats are ordered first by semantics (i.e. float, double, half, etc.), // then by value interpreted as a bitstring (aka APInt). @@ -147,12 +154,22 @@ int FunctionComparator::cmpAttrs(const AttributeList L, if (LA.getKindAsEnum() != RA.getKindAsEnum()) return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum()); - const ConstantRange &LCR = LA.getRange(); - const ConstantRange &RCR = RA.getRange(); - if (int Res = cmpAPInts(LCR.getLower(), RCR.getLower())) + if (int Res = cmpConstantRanges(LA.getRange(), RA.getRange())) return Res; - if (int Res = cmpAPInts(LCR.getUpper(), RCR.getUpper())) + continue; + } else if (LA.isConstantRangeListAttribute() && + RA.isConstantRangeListAttribute()) { + if (LA.getKindAsEnum() != RA.getKindAsEnum()) + return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum()); + + ArrayRef<ConstantRange> CRL = LA.getValueAsConstantRangeList(); + ArrayRef<ConstantRange> CRR = RA.getValueAsConstantRangeList(); + if (int Res = cmpNumbers(CRL.size(), CRR.size())) return Res; + + for (const auto &[L, R] : zip(CRL, CRR)) + if (int Res = cmpConstantRanges(L, R)) + return Res; continue; } if (LA < RA) @@ -441,9 +458,7 @@ int FunctionComparator::cmpConstants(const Constant *L, if (InRangeL) { if (!InRangeR) return 1; - if (int Res = 
cmpAPInts(InRangeL->getLower(), InRangeR->getLower())) - return Res; - if (int Res = cmpAPInts(InRangeL->getUpper(), InRangeR->getUpper())) + if (int Res = cmpConstantRanges(*InRangeL, *InRangeR)) return Res; } else if (InRangeR) { return -1; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index cdc3f0308fe5..1e4061cb0771 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1279,10 +1279,10 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, // | for.body <---- (md2) // |_______| |______| if (Instruction *TI = BB->getTerminator()) - if (TI->hasMetadata(LLVMContext::MD_loop)) + if (TI->hasNonDebugLocLoopMetadata()) for (BasicBlock *Pred : predecessors(BB)) if (Instruction *PredTI = Pred->getTerminator()) - if (PredTI->hasMetadata(LLVMContext::MD_loop)) + if (PredTI->hasNonDebugLocLoopMetadata()) return false; if (BBKillable) @@ -1345,12 +1345,15 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, } } - // If the unconditional branch we replaced contains llvm.loop metadata, we - // add the metadata to the branch instructions in the predecessors. + // If the unconditional branch we replaced contains non-debug llvm.loop + // metadata, we add the metadata to the branch instructions in the + // predecessors. if (Instruction *TI = BB->getTerminator()) - if (MDNode *LoopMD = TI->getMetadata(LLVMContext::MD_loop)) + if (TI->hasNonDebugLocLoopMetadata()) { + MDNode *LoopMD = TI->getMetadata(LLVMContext::MD_loop); for (BasicBlock *Pred : predecessors(BB)) Pred->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopMD); + } if (BBKillable) { // Everything that jumped to BB now goes to Succ. 
@@ -3305,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, return Changed; } +// FIXME: https://github.com/llvm/llvm-project/issues/121495 +// Once external callers of this function are removed, either inline into +// combineMetadataForCSE, or internalize and remove KnownIDs parameter. void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs, bool DoesKMove) { SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; @@ -3317,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, switch (Kind) { default: + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Change to removing only explicitly listed other metadata, and assert + // on unknown metadata, to avoid inadvertently dropping newly added + // metadata types. K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: @@ -3376,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_memprof: + K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD)); + break; + case LLVMContext::MD_callsite: + K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD)); + break; case LLVMContext::MD_preserve_access_index: // Preserve !preserve.access.index in K. 
break; @@ -3439,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_nontemporal, LLVMContext::MD_noundef, LLVMContext::MD_mmra, - LLVMContext::MD_noalias_addrspace}; + LLVMContext::MD_noalias_addrspace, + LLVMContext::MD_memprof, + LLVMContext::MD_callsite}; combineMetadata(K, J, KnownIDs, KDominatesJ); } diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 3cbde39b30b4..9a24c1b0d03d 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -378,7 +378,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, return; } - CmpInst::Predicate Pred; + CmpPredicate Pred; if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal)))) return; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 04042e71a2b8..fffff295ba92 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -171,14 +171,14 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Users in the OrigPreHeader need to use the value to which the // original definitions are mapped and anything else can be handled by // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. + // available in UserBB, if not substitute poison. 
Value *NewVal; if (UserBB == OrigPreheader) NewVal = OrigPreHeaderVal; else if (SSA.HasValueForBlock(UserBB)) NewVal = SSA.GetValueInMiddleOfBlock(UserBB); else - NewVal = UndefValue::get(OrigHeaderVal->getType()); + NewVal = PoisonValue::get(OrigHeaderVal->getType()); DbgValue->replaceVariableLocationOp(OrigHeaderVal, NewVal); } @@ -194,14 +194,14 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Users in the OrigPreHeader need to use the value to which the // original definitions are mapped and anything else can be handled by // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. + // available in UserBB, if not substitute poison. Value *NewVal; if (UserBB == OrigPreheader) NewVal = OrigPreHeaderVal; else if (SSA.HasValueForBlock(UserBB)) NewVal = SSA.GetValueInMiddleOfBlock(UserBB); else - NewVal = UndefValue::get(OrigHeaderVal->getType()); + NewVal = PoisonValue::get(OrigHeaderVal->getType()); DVR->replaceVariableLocationOp(OrigHeaderVal, NewVal); } } diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 44fdfe530178..d8298646e18d 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -777,8 +777,8 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", - "Canonicalize natural loops", false, false) +INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", + false, true) // Publicly exposed interface to pass... 
char &llvm::LoopSimplifyID = LoopSimplify::ID; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 70047273c3b9..45915c10107b 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1208,6 +1208,23 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); } +Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, + const RecurrenceDescriptor &Desc) { + assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( + Desc.getRecurrenceKind()) && + "Unexpected reduction kind"); + Value *StartVal = Desc.getRecurrenceStartValue(); + Value *Sentinel = Desc.getSentinelValue(); + Value *MaxRdx = Src->getType()->isVectorTy() + ? Builder.CreateIntMaxReduce(Src, true) + : Src; + // Correct the final reduction result back to the start value if the maximum + // reduction is sentinel value. + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, MaxRdx, Sentinel, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, MaxRdx, StartVal, "rdx.select"); +} + Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty, FastMathFlags Flags) { bool Negative = false; @@ -1315,6 +1332,8 @@ Value *llvm::createReduction(IRBuilderBase &B, RecurKind RK = Desc.getRecurrenceKind(); if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) + return createFindLastIVReduction(B, Src, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 8f8c40a4e73b..5ee551e6f0cc 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include 
"llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; @@ -278,6 +279,9 @@ bool runImpl(LoopInfo *LI, LoopAccessInfoManager &LAIs, DominatorTree *DT, if (!LAI.hasConvergentOp() && (LAI.getNumRuntimePointerChecks() || !LAI.getPSE().getPredicate().isAlwaysTrue())) { + if (!L->isLCSSAForm(*DT)) + formLCSSARecursively(*L, *DT, LI, SE); + LoopVersioning LVer(LAI, LAI.getRuntimePointerChecking()->getChecks(), L, LI, DT, SE); LVer.versionLoop(); diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 4225e7e80fda..81aa7ce1cfe6 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -147,6 +147,16 @@ static bool refineInstruction(SCCPSolver &Solver, Changed = true; } } + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&Inst)) { + if (GEP->hasNoUnsignedWrap() || !GEP->hasNoUnsignedSignedWrap()) + return false; + + if (all_of(GEP->indices(), + [&](Value *V) { return GetRange(V).isAllNonNegative(); })) { + GEP->setNoWrapFlags(GEP->getNoWrapFlags() | + GEPNoWrapFlags::noUnsignedWrap()); + Changed = true; + } } return Changed; diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 597d470f18ff..4bf4acd6330f 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -412,9 +412,13 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { updateDebugInfo(SI); SSA.AddAvailableValue(BB, SI->getOperand(0)); - } else + } else if (auto *AI = dyn_cast<AllocaInst>(User)) { + // We treat AllocaInst as a store of an getValueToUseForAlloca value. + SSA.AddAvailableValue(BB, getValueToUseForAlloca(AI)); + } else { // Otherwise it is a load, queue it to rewrite as a live-in load. 
LiveInLoads.push_back(cast<LoadInst>(User)); + } BlockUses.clear(); continue; } @@ -422,7 +426,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { // Otherwise, check to see if this block is all loads. bool HasStore = false; for (Instruction *I : BlockUses) { - if (isa<StoreInst>(I)) { + if (isa<StoreInst>(I) || isa<AllocaInst>(I)) { HasStore = true; break; } @@ -468,6 +472,12 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { // Remember that this is the active value in the block. StoredValue = SI->getOperand(0); + } else if (auto *AI = dyn_cast<AllocaInst>(&I)) { + // Check if this an alloca, in which case we treat it as a store of + // getValueToUseForAlloca. + if (!isInstInList(AI, Insts)) + continue; + StoredValue = getValueToUseForAlloca(AI); } } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 791d52882397..0bc752a92340 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1816,7 +1816,7 @@ bool SCEVExpander::hasRelatedExistingExpansion(const SCEV *S, // Look for suitable value in simple conditions at the loop exits. 
for (BasicBlock *BB : ExitingBlocks) { - ICmpInst::Predicate Pred; + CmpPredicate Pred; Instruction *LHS, *RHS; if (!match(BB->getTerminator(), diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index c7e814bced57..febc5682c212 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -285,7 +285,7 @@ class SimplifyCFGOpt { bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); - bool hoistCommonCodeFromSuccessors(Instruction *TI, bool EqTermsOnly); + bool hoistCommonCodeFromSuccessors(Instruction *TI, bool AllInstsEqOnly); bool hoistSuccIdenticalTerminatorToSwitchOrIf( Instruction *TI, Instruction *I1, SmallVectorImpl<Instruction *> &OtherSuccTIs); @@ -1772,13 +1772,84 @@ static bool isSafeCheapLoadStore(const Instruction *I, getLoadStoreAlignment(I) < Value::MaximumAlignment; } +namespace { + +// LockstepReverseIterator - Iterates through instructions +// in a set of blocks in reverse order from the first non-terminator. +// For example (assume all blocks have size n): +// LockstepReverseIterator I([B1, B2, B3]); +// *I-- = [B1[n], B2[n], B3[n]]; +// *I-- = [B1[n-1], B2[n-1], B3[n-1]]; +// *I-- = [B1[n-2], B2[n-2], B3[n-2]]; +// ... +class LockstepReverseIterator { + ArrayRef<BasicBlock *> Blocks; + SmallVector<Instruction *, 4> Insts; + bool Fail; + +public: + LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) { + reset(); + } + + void reset() { + Fail = false; + Insts.clear(); + for (auto *BB : Blocks) { + Instruction *Inst = BB->getTerminator(); + for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) + Inst = Inst->getPrevNode(); + if (!Inst) { + // Block wasn't big enough. 
+ Fail = true; + return; + } + Insts.push_back(Inst); + } + } + + bool isValid() const { return !Fail; } + + void operator--() { + if (Fail) + return; + for (auto *&Inst : Insts) { + for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) + Inst = Inst->getPrevNode(); + // Already at beginning of block. + if (!Inst) { + Fail = true; + return; + } + } + } + + void operator++() { + if (Fail) + return; + for (auto *&Inst : Insts) { + for (Inst = Inst->getNextNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) + Inst = Inst->getNextNode(); + // Already at end of block. + if (!Inst) { + Fail = true; + return; + } + } + } + + ArrayRef<Instruction *> operator*() const { return Insts; } +}; + +} // end anonymous namespace + /// Hoist any common code in the successor blocks up into the block. This -/// function guarantees that BB dominates all successors. If EqTermsOnly is -/// given, only perform hoisting in case both blocks only contain a terminator. -/// In that case, only the original BI will be replaced and selects for PHIs are -/// added. +/// function guarantees that BB dominates all successors. If AllInstsEqOnly is +/// given, only perform hoisting in case all successors blocks contain matching +/// instructions only. In that case, all instructions can be hoisted and the +/// original branch will be replaced and selects for PHIs are added. bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, - bool EqTermsOnly) { + bool AllInstsEqOnly) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(N1*N2*...) situations here where Ni are the sizes of these successors. As @@ -1807,17 +1878,35 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, SuccIterPairs.push_back(SuccIterPair(SuccItr, 0)); } - // Check if only hoisting terminators is allowed. This does not add new - // instructions to the hoist location. 
- if (EqTermsOnly) { - // Skip any debug intrinsics, as they are free to hoist. - for (auto &SuccIter : make_first_range(SuccIterPairs)) { - auto *INonDbg = &*skipDebugIntrinsics(SuccIter); - if (!INonDbg->isTerminator()) - return false; + if (AllInstsEqOnly) { + // Check if all instructions in the successor blocks match. This allows + // hoisting all instructions and removing the blocks we are hoisting from, + // so does not add any new instructions. + SmallVector<BasicBlock *> Succs = to_vector(successors(BB)); + // Check if sizes and terminators of all successors match. + bool AllSame = none_of(Succs, [&Succs](BasicBlock *Succ) { + Instruction *Term0 = Succs[0]->getTerminator(); + Instruction *Term = Succ->getTerminator(); + return !Term->isSameOperationAs(Term0) || + !equal(Term->operands(), Term0->operands()) || + Succs[0]->size() != Succ->size(); + }); + if (!AllSame) + return false; + if (AllSame) { + LockstepReverseIterator LRI(Succs); + while (LRI.isValid()) { + Instruction *I0 = (*LRI)[0]; + if (any_of(*LRI, [I0](Instruction *I) { + return !areIdenticalUpToCommutativity(I0, I); + })) { + return false; + } + --LRI; + } } - // Now we know that we only need to hoist debug intrinsics and the - // terminator. Let the loop below handle those 2 cases. + // Now we know that all instructions in all successors can be hoisted. Let + // the loop below handle the hoisting. } // Count how many instructions were not hoisted so far. There's a limit on how @@ -2350,81 +2439,6 @@ static void sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) { } } -namespace { - - // LockstepReverseIterator - Iterates through instructions - // in a set of blocks in reverse order from the first non-terminator. - // For example (assume all blocks have size n): - // LockstepReverseIterator I([B1, B2, B3]); - // *I-- = [B1[n], B2[n], B3[n]]; - // *I-- = [B1[n-1], B2[n-1], B3[n-1]]; - // *I-- = [B1[n-2], B2[n-2], B3[n-2]]; - // ... 
- class LockstepReverseIterator { - ArrayRef<BasicBlock*> Blocks; - SmallVector<Instruction*,4> Insts; - bool Fail; - - public: - LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) { - reset(); - } - - void reset() { - Fail = false; - Insts.clear(); - for (auto *BB : Blocks) { - Instruction *Inst = BB->getTerminator(); - for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) - Inst = Inst->getPrevNode(); - if (!Inst) { - // Block wasn't big enough. - Fail = true; - return; - } - Insts.push_back(Inst); - } - } - - bool isValid() const { - return !Fail; - } - - void operator--() { - if (Fail) - return; - for (auto *&Inst : Insts) { - for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) - Inst = Inst->getPrevNode(); - // Already at beginning of block. - if (!Inst) { - Fail = true; - return; - } - } - } - - void operator++() { - if (Fail) - return; - for (auto *&Inst : Insts) { - for (Inst = Inst->getNextNode(); Inst && isa<DbgInfoIntrinsic>(Inst);) - Inst = Inst->getNextNode(); - // Already at end of block. - if (!Inst) { - Fail = true; - return; - } - } - } - - ArrayRef<Instruction*> operator * () const { - return Insts; - } - }; - -} // end anonymous namespace - /// Check whether BB's predecessors end with unconditional branches. If it is /// true, sink any common code from the predecessors to BB. static bool sinkCommonCodeFromPredecessors(BasicBlock *BB, @@ -6517,8 +6531,8 @@ SwitchLookupTable::SwitchLookupTable( uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue(); TableContents[Idx] = CaseRes; - if (CaseRes != SingleValue) - SingleValue = nullptr; + if (SingleValue && !isa<PoisonValue>(CaseRes) && CaseRes != SingleValue) + SingleValue = isa<PoisonValue>(SingleValue) ? CaseRes : nullptr; } // Fill in any holes in the table with the default result. 
@@ -6531,7 +6545,10 @@ SwitchLookupTable::SwitchLookupTable( TableContents[I] = DefaultValue; } - if (DefaultValue != SingleValue) + // If the default value is poison, all the holes are poison. + bool DefaultValueIsPoison = isa<PoisonValue>(DefaultValue); + + if (DefaultValue != SingleValue && !DefaultValueIsPoison) SingleValue = nullptr; } @@ -6555,6 +6572,16 @@ SwitchLookupTable::SwitchLookupTable( // Check if there is the same distance between two consecutive values. for (uint64_t I = 0; I < TableSize; ++I) { ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]); + + if (!ConstVal && isa<PoisonValue>(TableContents[I])) { + // This is an poison, so it's (probably) a lookup table hole. + // To prevent any regressions from before we switched to using poison as + // the default value, holes will fall back to using the first value. + // This can be removed once we add proper handling for poisons in lookup + // tables. + ConstVal = dyn_cast<ConstantInt>(Values[0].second); + } + if (!ConstVal) { // This is an undef. We could deal with it, but undefs in lookup tables // are very seldom. It's probably not worth the additional complexity. @@ -6989,8 +7016,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If the table has holes but the default destination doesn't produce any // constant results, the lookup table entries corresponding to the holes will - // contain undefined values. - bool AllHolesAreUndefined = TableHasHoles && !HasDefaultResults; + // contain poison. + bool AllHolesArePoison = TableHasHoles && !HasDefaultResults; // If the default destination doesn't produce a constant result but is still // reachable, and the lookup table has holes, we need to use a mask to @@ -6998,7 +7025,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // to the default case. // The mask is unnecessary if the table has holes but the default destination // is unreachable, as in that case the holes must also be unreachable. 
- bool NeedMask = AllHolesAreUndefined && DefaultIsReachable; + bool NeedMask = AllHolesArePoison && DefaultIsReachable; if (NeedMask) { // As an extra penalty for the validity test we require more cases. if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark). @@ -7143,9 +7170,11 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, for (PHINode *PHI : PHIs) { const ResultListTy &ResultList = ResultLists[PHI]; + Type *ResultType = ResultList.begin()->second->getType(); + // Use any value to fill the lookup table holes. Constant *DV = - AllHolesAreUndefined ? ResultLists[PHI][0].second : DefaultResults[PHI]; + AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI]; StringRef FuncName = Fn->getName(); SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, DL, FuncName); @@ -7474,9 +7503,6 @@ static bool simplifySwitchOfCmpIntrinsic(SwitchInst *SI, IRBuilderBase &Builder, /// IncomingValue and add it in the Wrapper so isEqual can do O(1) checking /// of the incoming values. struct SwitchSuccWrapper { - // Keep so we can use SwitchInst::setSuccessor to do the replacement. It won't - // be important to equality though. - unsigned SuccNum; BasicBlock *Dest; DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> *PhiPredIVs; }; @@ -7563,6 +7589,7 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, SmallPtrSet<PHINode *, 8> Phis; SmallPtrSet<BasicBlock *, 8> Seen; DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> PhiPredIVs; + DenseMap<BasicBlock *, SmallVector<unsigned, 4>> BBToSuccessorIndexes; SmallVector<SwitchSuccWrapper> Cases; Cases.reserve(SI->getNumSuccessors()); @@ -7575,8 +7602,9 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, continue; // FIXME: This case needs some extra care because the terminators other than - // SI need to be updated. - if (BB->hasNPredecessorsOrMore(2)) + // SI need to be updated. 
For now, consider only backedges to the SI. + if (BB->hasNPredecessorsOrMore(4) || + BB->getUniquePredecessor() != SI->getParent()) continue; // FIXME: Relax that the terminator is a BranchInst by checking for equality @@ -7591,8 +7619,11 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, for (BasicBlock *Succ : BI->successors()) for (PHINode &Phi : Succ->phis()) Phis.insert(&Phi); + // Add the successor only if not previously visited. + Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); } - Cases.emplace_back(SwitchSuccWrapper{I, BB, &PhiPredIVs}); + + BBToSuccessorIndexes[BB].emplace_back(I); } // Precompute a data structure to improve performance of isEqual for @@ -7627,7 +7658,9 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, // We know that SI's parent BB no longer dominates the old case successor // since we are making it dead. Updates.push_back({DominatorTree::Delete, SI->getParent(), SSW.Dest}); - SI->setSuccessor(SSW.SuccNum, (*It)->Dest); + const auto &Successors = BBToSuccessorIndexes.at(SSW.Dest); + for (unsigned Idx : Successors) + SI->setSuccessor(Idx, (*It)->Dest); MadeChange = true; } } diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 7fca1a6aa526..f05d32d980e5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -2164,16 +2164,14 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, !NarrowDefRHS->isNonNegative()) return; - auto UpdateRangeFromCondition = [&] (Value *Condition, - bool TrueDest) { - CmpInst::Predicate Pred; + auto UpdateRangeFromCondition = [&](Value *Condition, bool TrueDest) { + CmpPredicate Pred; Value *CmpRHS; if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS), m_Value(CmpRHS)))) return; - CmpInst::Predicate P = - TrueDest ? Pred : CmpInst::getInversePredicate(Pred); + CmpPredicate P = TrueDest ? 
Pred : ICmpInst::getInverseCmpPredicate(Pred); auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index d85e0d994660..737818b7825c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -397,9 +397,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy( - CpyDst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); + B.CreateMemCpy(CpyDst, Align(1), Src, Align(1), + TLI->getAsSizeT(Len + 1, *B.GetInsertBlock()->getModule())); return Dst; } @@ -590,26 +589,21 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { if (Len1 && Len2) { return copyFlags( *CI, emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - std::min(Len1, Len2)), + TLI->getAsSizeT(std::min(Len1, Len2), *CI->getModule()), B, DL, TLI)); } // strcmp to memcmp if (!HasStr1 && HasStr2) { if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len2, *CI->getModule()), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len1, *CI->getModule()), + B, DL, TLI)); } annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); @@ -676,19 +670,15 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, 
IRBuilderBase &B) { if (!HasStr1 && HasStr2) { Len2 = std::min(Len2, Length); if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len2, *CI->getModule()), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { Len1 = std::min(Len1, Length); if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return copyFlags( - *CI, - emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), - B, DL, TLI)); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, + TLI->getAsSizeT(Len1, *CI->getModule()), + B, DL, TLI)); } return nullptr; @@ -722,15 +712,13 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - CallInst *NewCI = - B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), + TLI->getAsSizeT(Len, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); return Dst; } Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); // stpcpy(d,s) -> strcpy(d,s) if the result is not used. 
@@ -749,10 +737,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { else return nullptr; - Type *PT = Callee->getFunctionType()->getParamType(0); - Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); + Value *LenV = TLI->getAsSizeT(Len, *CI->getModule()); Value *DstEnd = B.CreateInBoundsGEP( - B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); + B.getInt8Ty(), Dst, TLI->getAsSizeT(Len - 1, *CI->getModule())); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -819,13 +806,11 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { return ConstantInt::get(CI->getType(), 0); } - Function *Callee = CI->getCalledFunction(); - Type *PT = Callee->getFunctionType()->getParamType(0); // Transform strlcpy(D, S, N) to memcpy(D, S, N') where N' is the lower // bound on strlen(S) + 1 and N, optionally followed by a nul store to // D[N' - 1] if necessary. CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(PT), NBytes)); + TLI->getAsSizeT(NBytes, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); if (!NulTerm) { @@ -844,7 +829,6 @@ Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { // otherwise. Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -921,11 +905,10 @@ Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, /*M=*/nullptr, /*AddNull=*/false); } - Type *PT = Callee->getFunctionType()->getParamType(0); // st{p,r}ncpy(D, S, N) -> memcpy(align 1 D, align 1 S, N) when both // S and N are constant. 
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), - ConstantInt::get(DL.getIntPtrType(PT), N)); + TLI->getAsSizeT(N, *CI->getModule())); mergeAttributesAndFlags(NewCI, *CI); if (!RetEnd) return Dst; @@ -3432,10 +3415,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) - B.CreateMemCpy( - Dest, Align(1), CI->getArgOperand(1), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(1), Align(1), + // Copy the null byte. + TLI->getAsSizeT(FormatStr.size() + 1, *CI->getModule())); return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -3470,9 +3452,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, uint64_t SrcLen = GetStringLength(CI->getArgOperand(2)); if (SrcLen) { - B.CreateMemCpy( - Dest, Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), SrcLen)); + B.CreateMemCpy(Dest, Align(1), CI->getArgOperand(2), Align(1), + TLI->getAsSizeT(SrcLen, *CI->getModule())); // Returns total number of characters written without null-character. return ConstantInt::get(CI->getType(), SrcLen - 1); } else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) { @@ -3570,11 +3551,8 @@ Value *LibCallSimplifier::emitSnPrintfMemCpy(CallInst *CI, Value *StrArg, Value *DstArg = CI->getArgOperand(0); if (NCopy && StrArg) // Transform the call to lvm.memcpy(dst, fmt, N). 
- copyFlags( - *CI, - B.CreateMemCpy( - DstArg, Align(1), StrArg, Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), NCopy))); + copyFlags(*CI, B.CreateMemCpy(DstArg, Align(1), StrArg, Align(1), + TLI->getAsSizeT(NCopy, *CI->getModule()))); if (N > Str.size()) // Return early when the whole format string, including the final nul, @@ -3690,11 +3668,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, if (FormatStr.contains('%')) return nullptr; // We found a format specifier. - unsigned SizeTBits = TLI->getSizeTSize(*CI->getModule()); - Type *SizeTTy = IntegerType::get(CI->getContext(), SizeTBits); return copyFlags( *CI, emitFWrite(CI->getArgOperand(1), - ConstantInt::get(SizeTTy, FormatStr.size()), + TLI->getAsSizeT(FormatStr.size(), *CI->getModule()), CI->getArgOperand(0), B, DL, TLI)); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f1568781252c..cb0b4641b649 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -666,7 +666,6 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { // Check whether we are able to set up outer loop induction. 
if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", - "Unsupported outer loop Phi(s)", "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; @@ -927,7 +926,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { auto *SE = PSE.getSE(); Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) - if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", @@ -962,7 +961,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", - "store instruction cannot be vectorized", "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -976,7 +974,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", - "nontemporal store instruction cannot be vectorized", "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } @@ -991,7 +988,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", - "nontemporal load instruction cannot be vectorized", "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } @@ -1020,7 +1016,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } reportVectorizationFailure("Value cannot be used outside the loop", - "value cannot be used outside the loop", "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } @@ -1375,6 +1370,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence( } 
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { + // When vectorizing early exits, create predicates for the latch block only. + // The early exiting block must be a direct predecessor of the latch at the + // moment. + BasicBlock *Latch = TheLoop->getLoopLatch(); + if (hasUncountableEarlyExit()) { + assert( + is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) && + "Uncountable exiting block must be a direct predecessor of latch"); + return BB == Latch; + } return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -1432,9 +1437,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", - "if-conversion is disabled", - "IfConversionDisabled", - ORE, TheLoop); + "IfConversionDisabled", ORE, TheLoop); return false; } @@ -1483,14 +1486,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (isa<SwitchInst>(BB->getTerminator())) { if (TheLoop->isLoopExiting(BB)) { reportVectorizationFailure("Loop contains an unsupported switch", - "loop contains an unsupported switch", "LoopContainsUnsupportedSwitch", ORE, TheLoop, BB->getTerminator()); return false; } } else if (!isa<BranchInst>(BB->getTerminator())) { reportVectorizationFailure("Loop contains an unsupported terminator", - "loop contains an unsupported terminator", "LoopContainsUnsupportedTerminator", ORE, TheLoop, BB->getTerminator()); return false; @@ -1500,8 +1501,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointers, MaskedOp)) { reportVectorizationFailure( - "Control flow cannot be substituted for a select", - "control flow cannot be substituted for a select", "NoCFGForSelect", + "Control flow cannot be substituted for a select", "NoCFGForSelect", ORE, TheLoop, BB->getTerminator()); return false; } @@ -1691,8 
+1691,6 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } else if (!IsSafeOperation(&I)) { reportVectorizationFailure("Early exit loop contains operations that " "cannot be speculatively executed", - "Early exit loop contains operations that " - "cannot be speculatively executed", "UnsafeOperationsEarlyExitLoop", ORE, TheLoop); return false; @@ -1754,9 +1752,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", - "unsupported outer loop", - "UnsupportedOuterLoop", - ORE, TheLoop); + "UnsupportedOuterLoop", ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1788,13 +1784,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { HasUncountableEarlyExit = false; if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { + HasUncountableEarlyExit = true; if (!isVectorizableEarlyExitLoop()) { + UncountableExitingBlocks.clear(); + HasUncountableEarlyExit = false; if (DoExtraAnalysis) Result = false; else return false; - } else - HasUncountableEarlyExit = true; + } } // Go over each instruction and look at memory deps. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index fbcf181a45a6..26a2de8c8097 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -222,21 +222,24 @@ public: VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, const Twine &Name = "") { - return tryInsertInstruction(new VPInstruction( - Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name)); + return tryInsertInstruction( + new VPInstruction(Ptr, Offset, GEPNoWrapFlags::none(), DL, Name)); } VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, const Twine &Name = "") { - return tryInsertInstruction(new VPInstruction( - Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name)); + return tryInsertInstruction( + new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name)); } + /// Convert the input value \p Current to the corresponding value of an + /// induction with \p Start and \p Step values, using \p Start + \p Current * + /// \p Step. 
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, - VPCanonicalIVPHIRecipe *CanonicalIV, - VPValue *Step, const Twine &Name = "") { + VPValue *Current, VPValue *Step, + const Twine &Name = "") { return tryInsertInstruction( - new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name)); + new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name)); } VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3c7c044a0427..f2f8a85b7cc2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent( cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants")); +static cl::opt<bool> EnableEarlyExitVectorization( + "enable-early-exit-vectorization", cl::init(false), cl::Hidden, + cl::desc( + "Enable vectorization of early exit loops with uncountable exits.")); + // Likelyhood of bypassing the vectorized loop because assumptions about SCEV // variables not overflowing do not hold. See `emitSCEVChecks`. static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; @@ -474,7 +479,8 @@ public: AC(AC), ORE(ORE), VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI), RTChecks(RTChecks), Plan(Plan) { + PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -487,12 +493,11 @@ public: /// on, while the old loop will be used as the scalar remainder. 
Control flow /// is generated around the vectorized (and scalar epilogue) loops consisting /// of various checks and bypasses. Return the pre-header block of the new - /// loop and the start value for the canonical induction, if it is != 0. The - /// latter is the case when vectorizing the epilogue loop. In the case of - /// epilogue vectorization, this function is overriden to handle the more - /// complex control flow around the loops. \p ExpandedSCEVs is used to - /// look up SCEV expansions for expressions needed during skeleton creation. - virtual std::pair<BasicBlock *, Value *> + /// loop. In the case of epilogue vectorization, this function is overriden to + /// handle the more complex control flow around the loops. \p ExpandedSCEVs is + /// used to look up SCEV expansions for expressions needed during skeleton + /// creation. + virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); /// Fix the vectorized code, taking care of header phi's, and more. @@ -513,18 +518,6 @@ public: /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPTransformState &State); - /// Create a new phi node for the induction variable \p OrigPhi to resume - /// iteration count in the scalar epilogue, from where the vectorized loop - /// left off. \p Step is the SCEV-expanded induction step to use. In cases - /// where the loop skeleton is more complicated (i.e., epilogue vectorization) - /// and the resume values can come from an additional bypass block, the \p - /// AdditionalBypass pair provides information about the bypass block and the - /// end value on the edge from bypass to this loop. - PHINode *createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, - ArrayRef<BasicBlock *> BypassBlocks, - std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); - /// Returns the original loop trip count. 
Value *getTripCount() const { return TripCount; } @@ -533,6 +526,20 @@ public: /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } + // Retrieve the additional bypass value associated with an original + /// induction header phi. + Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const { + return Induction2AdditionalBypassValue.at(OrigPhi); + } + + /// Return the additional bypass block which targets the scalar loop by + /// skipping the epilogue loop after completing the main loop. + BasicBlock *getAdditionalBypassBlock() const { + assert(AdditionalBypassBlock && + "Trying to access AdditionalBypassBlock but it has not been set"); + return AdditionalBypassBlock; + } + protected: friend class LoopVectorizationPlanner; @@ -566,21 +573,21 @@ protected: /// vector loop preheader, middle block and scalar preheader. void createVectorLoopSkeleton(StringRef Prefix); - /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off. - /// In cases where the loop skeleton is more complicated (eg. epilogue - /// vectorization) and the resume values can come from an additional bypass - /// block, the \p AdditionalBypass pair provides information about the bypass - /// block and the end value on the edge from bypass to this loop. - void createInductionResumeValues( - const SCEV2ValueTy &ExpandedSCEVs, - std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + /// Create and record the values for induction variables to resume coming from + /// the additional bypass block. + void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, + Value *MainVectorTripCount); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. 
virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. + void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + /// The original loop. Loop *OrigLoop; @@ -664,7 +671,21 @@ protected: /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; + /// Mapping of induction phis to their additional bypass values. They + /// need to be added as operands to phi nodes in the scalar loop preheader + /// after the epilogue skeleton has been created. + DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue; + + /// The additional bypass block which conditionally skips over the epilogue + /// loop after executing the main loop. Needed to resume inductions and + /// reductions during epilogue vectorization. + BasicBlock *AdditionalBypassBlock = nullptr; + VPlan &Plan; + + /// The vector preheader block of \p Plan, used as target for check blocks + /// introduced during skeleton creation. + VPBlockBase *VectorPHVPB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -681,10 +702,13 @@ struct EpilogueLoopVectorizationInfo { BasicBlock *MemSafetyCheck = nullptr; Value *TripCount = nullptr; Value *VectorTripCount = nullptr; + VPlan &EpiloguePlan; EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, - ElementCount EVF, unsigned EUF) - : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { + ElementCount EVF, unsigned EUF, + VPlan &EpiloguePlan) + : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), + EpiloguePlan(EpiloguePlan) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } @@ -714,15 +738,15 @@ public: // Override this function to handle the more complex control flow around the // three loops. 
- std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( - const SCEV2ValueTy &ExpandedSCEVs) final { + BasicBlock * + createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final { return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. - virtual std::pair<BasicBlock *, Value *> + virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; /// Holds and updates state information required to vectorize the main loop @@ -751,7 +775,7 @@ public: EPI, LVL, CM, BFI, PSI, Check, Plan) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair<BasicBlock *, Value *> + BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: @@ -786,7 +810,7 @@ public: } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair<BasicBlock *, Value *> + BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: @@ -1214,8 +1238,8 @@ public: return false; // Get the source and destination types of the truncate. - Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); - Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); + Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF); + Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF); // If the truncate is free for the given types, return false. 
Replacing a // free truncate with an induction variable would add an induction variable @@ -1350,9 +1374,10 @@ public: LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); return false; } - // If we might exit from anywhere but the latch, must run the exiting - // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If we might exit from anywhere but the latch and early exit vectorization + // is disabled, we must run the exiting iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && + !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " "from latch block\n"); return true; @@ -1706,7 +1731,8 @@ private: bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + TheLoop->isLoopInvariant(I) || + getWideningDecision(I, VF) == CM_Scalarize) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -2428,6 +2454,21 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { return VectorTripCount; } +void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { + VPBlockBase *ScalarPH = Plan.getScalarPreheader(); + VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); + if (PreVectorPH->getNumSuccessors() != 1) { + assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); + assert(PreVectorPH->getSuccessors()[0] == ScalarPH && + "Unexpected successor"); + VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); + PreVectorPH = CheckVPIRBB; + } + VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); + PreVectorPH->swapSuccessors(); +} + void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value 
*Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. @@ -2502,14 +2543,15 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass & LoopExit (if needed). - DT->changeImmediateDominator(Bypass, TCCheckBlock); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); + + // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. + introduceCheckBlockInVPlan(TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { @@ -2526,6 +2568,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { "Should already be a bypass block due to iteration count check"); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + + introduceCheckBlockInVPlan(SCEVCheckBlock); return SCEVCheckBlock; } @@ -2562,80 +2606,40 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { AddedSafetyChecks = true; + introduceCheckBlockInVPlan(MemCheckBlock); return MemCheckBlock; } +/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p +/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must +/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All +/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { + VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); + for (auto &R : make_early_inc_range(*VPBB)) { + assert(!R.isPhi() && "Tried to move phi recipe to end of block"); + R.moveBefore(*IRVPBB, IRVPBB->end()); + } + + VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); + // VPBB is now dead and will be cleaned up when the plan gets destroyed. +} + void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); - assert((OrigLoop->getUniqueExitBlock() || + assert((OrigLoop->getUniqueLatchExitBlock() || Cost->requiresScalarEpilogue(VF.isVector())) && - "multiple exit loop without required epilogue?"); + "loops not exiting via the latch without required epilogue?"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "middle.block"); + replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); -} - -PHINode *InnerLoopVectorizer::createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, - ArrayRef<BasicBlock *> BypassBlocks, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - assert(VectorTripCount && "Expected valid arguments"); - - Instruction *OldInduction = Legal->getPrimaryInduction(); - Value *EndValue = nullptr; - Value *EndValueFromAdditionalBypass = AdditionalBypass.second; - if (OrigPhi == OldInduction) { - // We know what the end value is. - EndValue = VectorTripCount; - } else { - IRBuilder<> B(LoopVectorPreHeader->getTerminator()); - - // Fast-math-flags propagate from the original induction instruction. 
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - - EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), - Step, II.getKind(), II.getInductionBinOp()); - EndValue->setName("ind.end"); - - // Compute the end value for the additional bypass (if applicable). - if (AdditionalBypass.first) { - B.SetInsertPoint(AdditionalBypass.first, - AdditionalBypass.first->getFirstInsertionPt()); - EndValueFromAdditionalBypass = - emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), - Step, II.getKind(), II.getInductionBinOp()); - EndValueFromAdditionalBypass->setName("ind.end"); - } - } - - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = - PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getFirstNonPHIIt()); - // Copy original phi DL over to the new one. - BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); - - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); - - // Fix the scalar body counter (PHI node). - // The old induction's phi node in the scalar body needs the truncated - // value. 
- for (BasicBlock *BB : BypassBlocks) - BCResumeVal->addIncoming(II.getStartValue(), BB); - - if (AdditionalBypass.first) - BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, - EndValueFromAdditionalBypass); - return BCResumeVal; + replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -2652,31 +2656,66 @@ static Value *getExpandedStep(const InductionDescriptor &ID, return I->second; } -void InnerLoopVectorizer::createInductionResumeValues( - const SCEV2ValueTy &ExpandedSCEVs, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(((AdditionalBypass.first && AdditionalBypass.second) || - (!AdditionalBypass.first && !AdditionalBypass.second)) && - "Inconsistent information about additional bypass."); - // We are going to resume the execution of the scalar loop. - // Go over all of the induction variables that we found and fix the - // PHIs that are left in the scalar version of the loop. - // The starting values of PHI nodes depend on the counter of the last - // iteration in the vectorized loop. - // If we come from a bypass edge then we need to start from the original - // start value. +/// Knowing that loop \p L executes a single vector iteration, add instructions +/// that will get simplified and thus should not have any cost to \p +/// InstsToIgnore. +static void addFullyUnrolledInstructionsToIgnore( + Loop *L, const LoopVectorizationLegality::InductionList &IL, + SmallPtrSetImpl<Instruction *> &InstsToIgnore) { + auto *Cmp = L->getLatchCmpInst(); + if (Cmp) + InstsToIgnore.insert(Cmp); + for (const auto &KV : IL) { + // Extract the key by hand so that it can be used in the lambda below. Note + // that captured structured bindings are a C++20 extension. + const PHINode *IV = KV.first; + + // Get next iteration value of the induction variable. 
+ Instruction *IVInst = + cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); + if (all_of(IVInst->users(), + [&](const User *U) { return U == IV || U == Cmp; })) + InstsToIgnore.insert(IVInst); + } +} + +void InnerLoopVectorizer::createInductionAdditionalBypassValues( + const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { + assert(MainVectorTripCount && "Must have bypass information"); + + Instruction *OldInduction = Legal->getPrimaryInduction(); + IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), + getAdditionalBypassBlock()->getFirstInsertionPt()); for (const auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; - PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, - AdditionalBypass); - OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); + Value *Step = getExpandedStep(II, ExpandedSCEVs); + // For the primary induction the additional bypass end value is known. + // Otherwise it is computed. + Value *EndValueFromAdditionalBypass = MainVectorTripCount; + if (OrigPhi != OldInduction) { + auto *BinOp = II.getInductionBinOp(); + // Fast-math-flags propagate from the original induction instruction. + if (isa_and_nonnull<FPMathOperator>(BinOp)) + BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); + + // Compute the end value for the additional bypass. + EndValueFromAdditionalBypass = + emitTransformedIndex(BypassBuilder, MainVectorTripCount, + II.getStartValue(), Step, II.getKind(), BinOp); + EndValueFromAdditionalBypass->setName("ind.end"); + } + + // Store the bypass value here, as it needs to be added as operand to its + // scalar preheader phi node after the epilogue skeleton has been created. + // TODO: Directly add as extra operand to the VPResumePHI recipe. 
+ assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && + "entry for OrigPhi already exits"); + Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; } } -std::pair<BasicBlock *, Value *> -InnerLoopVectorizer::createVectorizedLoopSkeleton( +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { /* In this function we generate a new loop. The new loop will contain @@ -2733,10 +2772,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( // faster. emitMemRuntimeChecks(LoopScalarPreHeader); - // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(ExpandedSCEVs); - - return {LoopVectorPreHeader, nullptr}; + return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -2753,8 +2789,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. 
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); - DenseMap<Value *, Value *> MissingVals; Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock( @@ -2808,6 +2842,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, } } + assert((MissingVals.empty() || + all_of(MissingVals, + [MiddleBlock, this](const std::pair<Value *, Value *> &P) { + return all_of( + predecessors(cast<Instruction>(P.first)->getParent()), + [MiddleBlock, this](BasicBlock *Pred) { + return Pred == MiddleBlock || + Pred == OrigLoop->getLoopLatch(); + }); + })) && + "Expected escaping values from latch/middle.block only"); + for (auto &I : MissingVals) { PHINode *PHI = cast<PHINode>(I.first); // One corner case we have to handle is two IVs "chasing" each-other, @@ -3411,14 +3457,14 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, } InstructionCost SafeDivisorCost = 0; - auto *VecTy = ToVectorTy(I->getType(), VF); + auto *VecTy = toVectorTy(I->getType(), VF); // The cost of the select guard to ensure all lanes are well defined // after we speculate above any internal control flow. - SafeDivisorCost += TTI.getCmpSelInstrCost( - Instruction::Select, VecTy, - ToVectorTy(Type::getInt1Ty(I->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, CostKind); + SafeDivisorCost += + TTI.getCmpSelInstrCost(Instruction::Select, VecTy, + toVectorTy(Type::getInt1Ty(I->getContext()), VF), + CmpInst::BAD_ICMP_PREDICATE, CostKind); // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. @@ -3585,10 +3631,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Start with the conditional branches exiting the loop. If the branch // condition is an instruction contained in the loop that is only used by the - // branch, it is uniform. + // branch, it is uniform. Note conditions from uncountable early exits are not + // uniform. 
SmallVector<BasicBlock *> Exiting; TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { + if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) + continue; auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) AddToWorklistIfAllowed(Cmp); @@ -4147,7 +4196,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (TC == 0) { reportVectorizationFailure( - "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); return FixedScalableVFPair::getNone(); @@ -4536,7 +4584,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, } auto WillWiden = [&TTI, VF](Type *ScalarTy) { - Type *VectorTy = ToVectorTy(ScalarTy, VF); + Type *VectorTy = toVectorTy(ScalarTy, VF); unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) return false; @@ -4673,6 +4721,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. + // TODO: Add support for loops with an early exit. if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; @@ -4921,6 +4970,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; + // We don't attempt to perform interleaving for loops with uncountable early + // exits because the VPInstruction::AnyOf code cannot currently handle + // multiple parts. 
+ if (Legal->hasUncountableEarlyExit()) + return 1; + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); @@ -5105,8 +5160,9 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, HasReductions && any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind()); + RecurKind RK = RdxDesc.getRecurrenceKind(); + return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); @@ -5519,7 +5575,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( - cast<VectorType>(ToVectorTy(I->getType(), VF)), + cast<VectorType>(toVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, /*Extract*/ false, CostKind); ScalarCost += @@ -5538,7 +5594,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( - cast<VectorType>(ToVectorTy(J->getType(), VF)), + cast<VectorType>(toVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, /*Extract*/ true, CostKind); } @@ -5559,6 +5615,15 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { InstructionCost Cost; + // If the vector loop gets executed exactly once with the given VF, ignore the + // costs of comparison and induction instructions, as they'll get simplified + // away. 
+ SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; + auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking()) + addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), + ValuesToIgnoreForVF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { InstructionCost BlockCost; @@ -5566,7 +5631,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || + if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) || (VF.isVector() && VecValuesToIgnore.count(&I))) continue; @@ -5640,7 +5705,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + Type *PtrTy = toVectorTy(Ptr->getType(), VF); // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` // that it is being called from this specific place. 
@@ -5691,7 +5756,7 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); @@ -5723,7 +5788,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -5749,7 +5814,7 @@ InstructionCost LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); @@ -5767,7 +5832,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Instruction *InsertPos = Group->getInsertPos(); Type *ValTy = getLoadStoreType(InsertPos); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -6012,7 +6077,7 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( return 0; InstructionCost Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); + Type *RetTy = toVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += 
TTI.getScalarizationOverhead( @@ -6278,9 +6343,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); + Type *RetTy = toVectorTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); + Tys.push_back(toVectorTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. @@ -6470,7 +6535,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = toVectorTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && !TTI.getNumberOfParts(VectorTy)) @@ -6530,8 +6595,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return Switch->getNumCases() * TTI.getCmpSelInstrCost( Instruction::ICmp, - ToVectorTy(Switch->getCondition()->getType(), VF), - ToVectorTy(Type::getInt1Ty(I->getContext()), VF), + toVectorTy(Switch->getCondition()->getType(), VF), + toVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::ICMP_EQ, CostKind); } case Instruction::PHI: { @@ -6576,8 +6641,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, } return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( - Instruction::Select, ToVectorTy(ResultTy, VF), - ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), + Instruction::Select, toVectorTy(ResultTy, VF), + toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); } @@ -6586,8 +6651,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (VF.isVector() && foldTailWithEVL() && Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { IntrinsicCostAttributes ICA( - Intrinsic::vp_merge, ToVectorTy(Phi->getType(), VF), - 
{ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); + Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), + {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); return TTI.getIntrinsicInstrCost(ICA, CostKind); } @@ -6727,7 +6792,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); } - VectorTy = ToVectorTy(ValTy, VF); + VectorTy = toVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, cast<CmpInst>(I)->getPredicate(), CostKind, {TTI::OK_AnyValue, TTI::OP_None}, @@ -6745,7 +6810,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } - VectorTy = ToVectorTy(getLoadStoreType(I), Width); + VectorTy = toVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::BitCast: @@ -6826,7 +6891,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, SrcScalarTy = IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); Type *SrcVecTy = - VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; + VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; if (canTruncateToMinimalBitwidth(I, VF)) { // If the result type is <= the source type, there will be no extend @@ -7248,6 +7313,17 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, continue; IVInsts.push_back(CI); } + + // If the vector loop gets executed exactly once with the given VF, ignore + // the costs of comparison and induction instructions, as they'll get + // simplified away. + // TODO: Remove this code after stepping away from the legacy cost model and + // adding code to simplify VPlans before calculating their costs. 
+ auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop); + if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking()) + addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), + CostCtx.SkipCostComputation); + for (Instruction *IVInst : IVInsts) { if (CostCtx.skipCostComputation(IVInst, VF.isVector())) continue; @@ -7344,7 +7420,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, // Pre-compute the cost for I, if it has a reduction pattern cost. for (Instruction *I : ChainOpsAndOperands) { auto ReductionCost = CM.getReductionPatternCost( - I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); + I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); if (!ReductionCost) continue; @@ -7584,7 +7660,8 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { // fix the reduction's scalar PHI node by adding the incoming value from the // main vector loop. static void fixReductionScalarResumeWhenVectorizingEpilog( - VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) { + VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, + BasicBlock *BypassBlock) { auto *EpiRedResult = dyn_cast<VPInstruction>(R); if (!EpiRedResult || EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) @@ -7621,21 +7698,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( auto *EpiResumePhiVPI = cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi)); auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true)); - BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent(); - bool Updated = false; - for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (is_contained(MainResumePhi->blocks(), Incoming)) { - assert(EpiResumePhi->getIncomingValueForBlock(Incoming) == - RdxDesc.getRecurrenceStartValue() && - "Trying to reset unexpected value"); - assert(!Updated && "Should update at most 1 incoming value"); - 
EpiResumePhi->setIncomingValueForBlock( - Incoming, MainResumePhi->getIncomingValueForBlock(Incoming)); - Updated = true; - } - } - assert(Updated && "Must update EpiResumePhi."); - (void)Updated; + EpiResumePhi->setIncomingValueForBlock( + BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( @@ -7656,23 +7720,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::unrollByUF(BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); - - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF - << ", UF=" << BestUF << '\n'); - BestVPlan.setName("Final VPlan"); - LLVM_DEBUG(BestVPlan.dump()); + VPlanTransforms::convertToConcreteRecipes(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, - &BestVPlan); + &BestVPlan, OrigLoop->getParentLoop(), + Legal->getWidestInductionType()); + +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif + + // 0. Generate SCEV-dependent code in the entry, including TripCount, before + // making any changes to the CFG. + if (!BestVPlan.getEntry()->empty()) + BestVPlan.getEntry()->execute(&State); - // 0. Generate SCEV-dependent code into the preheader, including TripCount, - // before making any changes to the CFG. - if (!BestVPlan.getPreheader()->empty()) { - State.CFG.PrevBB = OrigLoop->getLoopPreheader(); - State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); - BestVPlan.getPreheader()->execute(&State); - } if (!ILV.getTripCount()) ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); else @@ -7681,13 +7744,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. 
The vector loop is created during VPlan execution. - Value *CanonicalIVStartValue; - std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs - : State.ExpandedSCEVs); -#ifdef EXPENSIVE_CHECKS - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -#endif + State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( + ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); + if (VectorizingEpilogue) + VPlanTransforms::removeDeadRecipes(BestVPlan); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7718,20 +7778,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getTripCount(), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State); - VPlanTransforms::prepareToExecute(BestVPlan); + BestVPlan.prepareToExecute( + ILV.getTripCount(), + ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); + replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB); BestVPlan.execute(&State); - // 2.5 Collect reduction resume values. auto *ExitVPBB = BestVPlan.getMiddleBlock(); - if (VectorizingEpilogue) + // 2.5 When vectorizing the epilogue, fix reduction and induction resume + // values from the additional bypass block. 
+ if (VectorizingEpilogue) { + assert(!ILV.Legal->hasUncountableEarlyExit() && + "Epilogue vectorisation not yet supported with early exits"); + BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); for (VPRecipeBase &R : *ExitVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( - &R, State, State.CFG.VPBB2IRBB[ExitVPBB]); + &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock); + } + BasicBlock *PH = OrigLoop->getLoopPreheader(); + for (const auto &[IVPhi, _] : Legal->getInductionVars()) { + auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); + Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); + Inc->setIncomingValueForBlock(BypassBlock, V); } + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -7758,7 +7829,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( } TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) addRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, @@ -7788,8 +7859,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -std::pair<BasicBlock *, Value *> -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( +BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton(""); @@ -7820,12 +7890,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( // Generate the induction variable. 
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - // Skip induction resume value creation here because they will be created in - // the second pass for the scalar loop. The induction resume values for the - // inductions in the epilogue loop are created before executing the plan for - // the epilogue loop. - - return {LoopVectorPreHeader, nullptr}; + return LoopVectorPreHeader; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7880,8 +7945,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass. - DT->changeImmediateDominator(Bypass, TCCheckBlock); LoopBypassBlocks.push_back(TCCheckBlock); // Save the trip count so we don't have to regenerate it in the @@ -7896,6 +7959,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); + introduceCheckBlockInVPlan(TCCheckBlock); return TCCheckBlock; } @@ -7905,7 +7969,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -std::pair<BasicBlock *, Value *> +BasicBlock * EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton("vec.epilog."); @@ -7918,6 +7982,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( nullptr, "vec.epilog.iter.check", true); emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); + AdditionalBypassBlock = VecEpilogueIterationCountCheck; // Adjust the control flow taking the state info from the main loop // vectorization into account. 
@@ -7926,9 +7991,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopVectorPreHeader); - DT->changeImmediateDominator(LoopVectorPreHeader, - EPI.MainLoopIterationCountCheck); - EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); @@ -7939,19 +8001,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); - DT->changeImmediateDominator( - VecEpilogueIterationCountCheck, - VecEpilogueIterationCountCheck->getSinglePredecessor()); - DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) - // If there is an epilogue which must run, there's no edge from the - // middle block to exit blocks and thus no need to update the immediate - // dominator of the exit blocks. - DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(), - EPI.EpilogueIterationCountCheck); - // Keep track of bypass blocks, as they feed start values to the induction and // reduction phis in the scalar loop preheader. if (EPI.SCEVSafetyCheck) @@ -7988,27 +8039,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( Phi->removeIncomingValue(EPI.MemSafetyCheck); } - // Generate a resume induction for the vector epilogue and put it in the - // vector epilogue preheader - Type *IdxTy = Legal->getWidestInductionType(); - PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); - EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); - EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); - EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), - EPI.MainLoopIterationCountCheck); - - // Generate induction resume values. 
These variables save the new starting - // indexes for the scalar loop. They are used to test if there are any tail - // iterations left once the vector loop has completed. - // Note that when the vectorized epilogue is skipped due to iteration count - // check, then the resume value for the induction variable comes from - // the trip count of the main vector loop, hence passing the AdditionalBypass - // argument. - createInductionResumeValues(ExpandedSCEVs, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); - - return {LoopVectorPreHeader, EPResumeVal}; + // Generate bypass values from the additional bypass block. Note that when the + // vectorized epilogue is skipped due to iteration count check, then the + // resume value for the induction variable comes from the trip count of the + // main vector loop, passed as the second argument. + createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); + return LoopVectorPreHeader; } BasicBlock * @@ -8054,6 +8090,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( } ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); + + // A new entry block has been created for the epilogue VPlan. Hook it in, as + // otherwise we would try to modify the entry to the main vector loop. + VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); + VPBasicBlock *OldEntry = Plan.getEntry(); + VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); + Plan.setEntry(NewEntry); + // OldEntry is now dead and will be cleaned up when the plan gets destroyed. + + introduceCheckBlockInVPlan(Insert); return Insert; } @@ -8160,8 +8206,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction. 
- if (OrigLoop->isLoopExiting(Src)) + // adding uses of an otherwise potentially dead instruction unless we are + // vectorizing a loop with uncountable exits. In that case, we always + // materialize the mask. + if (OrigLoop->isLoopExiting(Src) && + Src != Legal->getUncountableEarlyExitingBlock()) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); @@ -8297,10 +8346,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, if (Reverse) VectorPtr = new VPReverseVectorPointerRecipe( Ptr, &Plan.getVF(), getLoadStoreType(I), - GEP ? GEP->isInBounds() : false, I->getDebugLoc()); + GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() + : GEPNoWrapFlags::none(), + I->getDebugLoc()); else VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), - GEP ? GEP->isInBounds() : false, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), I->getDebugLoc()); Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; @@ -8329,11 +8381,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, TruncI); + IndDesc, TruncI, + TruncI->getDebugLoc()); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc); + IndDesc, Phi->getDebugLoc()); } VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8355,7 +8408,8 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( [&](ElementCount VF) { return CM.isScalarAfterVectorization(Phi, VF); }, - Range)); + Range), + Phi->getDebugLoc()); } return nullptr; } @@ -8809,14 +8863,55 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -/// 
Create resume phis in the scalar preheader for first-order recurrences and -/// reductions and update the VPIRInstructions wrapping the original phis in the -/// scalar header. +/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the +/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute +/// the end value of the induction. +static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, + VPBuilder &VectorPHBuilder, + VPBuilder &ScalarPHBuilder, + VPTypeAnalysis &TypeInfo, + VPValue *VectorTC) { + auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); + // Truncated wide inductions resume from the last lane of their vector value + // in the last vector iteration which is handled elsewhere. + if (WideIntOrFp && WideIntOrFp->getTruncInst()) + return nullptr; + + VPValue *Start = WideIV->getStartValue(); + VPValue *Step = WideIV->getStepValue(); + const InductionDescriptor &ID = WideIV->getInductionDescriptor(); + VPValue *EndValue = VectorTC; + if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { + EndValue = VectorPHBuilder.createDerivedIV( + ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), + Start, VectorTC, Step); + } + + // EndValue is derived from the vector trip count (which has the same type as + // the widest induction) and thus may be wider than the induction here. + Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); + if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { + EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, + ScalarTypeOfWideIV); + } + + auto *ResumePhiRecipe = + ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, + WideIV->getDebugLoc(), "bc.resume.val"); + return ResumePhiRecipe; +} + +/// Create resume phis in the scalar preheader for first-order recurrences, +/// reductions and inductions, and update the VPIRInstructions wrapping the +/// original phis in the scalar header. 
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); - VPBuilder ScalarPHBuilder(ScalarPH); + VPBuilder VectorPHBuilder( + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor())); VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + VPBuilder ScalarPHBuilder(ScalarPH); VPValue *OneVPV = Plan.getOrAddLiveIn( ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { @@ -8824,9 +8919,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); if (!ScalarPhiI) break; + auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); - if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR)) + if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { + if (VPValue *ResumePhi = addResumePhiRecipeForInduction( + WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, + &Plan.getVectorTripCount())) { + ScalarPhiIRI->addOperand(ResumePhi); + continue; + } + // TODO: Also handle truncated inductions here. Computing end-values + // separately should be done as VPlan-to-VPlan optimization, after + // legalizing all resume values to use the last lane from the loop. + assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && + "should only skip truncated wide inductions"); continue; + } + // The backedge value provides the value to resume coming out of a loop, // which for FORs is a vector whose last element needs to be extracted. The // start value provides the value if the loop is bypassed. 
@@ -8852,14 +8961,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { static SetVector<VPIRInstruction *> collectUsersInExitBlocks( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector<PHINode *, InductionDescriptor> &Inductions) { + auto *MiddleVPBB = Plan.getMiddleBlock(); SetVector<VPIRInstruction *> ExitUsersToFix; for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { - BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock(); - BasicBlock *ExitingBB = find_singleton<BasicBlock>( - to_vector(predecessors(ExitBB)), - [OrigLoop](BasicBlock *Pred, bool AllowRepeats) { - return OrigLoop->contains(Pred) ? Pred : nullptr; - }); for (VPRecipeBase &R : *ExitVPBB) { auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); if (!ExitIRI) @@ -8867,35 +8971,48 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks( auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); if (!ExitPhi) break; - Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); - VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); - // Exit values for inductions are computed and updated outside of VPlan - // and independent of induction recipes. - // TODO: Compute induction exit values in VPlan. 
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) && - !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || - isa<VPWidenPointerInductionRecipe>(V) || - (isa<Instruction>(IncomingValue) && - OrigLoop->contains(cast<Instruction>(IncomingValue)) && - any_of(IncomingValue->users(), [&Inductions](User *U) { - auto *P = dyn_cast<PHINode>(U); - return P && Inductions.contains(P); - }))) - continue; - ExitUsersToFix.insert(ExitIRI); - ExitIRI->addOperand(V); + for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { + BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); + if (PredVPBB != MiddleVPBB) { + SmallVector<BasicBlock *> ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); + ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1] + : ExitingBlocks[0]; + } + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); + VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); + // Exit values for inductions are computed and updated outside of VPlan + // and independent of induction recipes. + // TODO: Compute induction exit values in VPlan. + if ((isa<VPWidenIntOrFpInductionRecipe>(V) && + !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || + isa<VPWidenPointerInductionRecipe>(V) || + (isa<Instruction>(IncomingValue) && + OrigLoop->contains(cast<Instruction>(IncomingValue)) && + any_of(IncomingValue->users(), [&Inductions](User *U) { + auto *P = dyn_cast<PHINode>(U); + return P && Inductions.contains(P); + }))) { + if (ExitVPBB->getSinglePredecessor() == MiddleVPBB) + continue; + } + ExitUsersToFix.insert(ExitIRI); + ExitIRI->addOperand(V); + } } } return ExitUsersToFix; } // Add exit values to \p Plan. Extracts are added for each entry in \p -// ExitUsersToFix if needed and their operands are updated. -static void +// ExitUsersToFix if needed and their operands are updated. Returns true if all +// exit users can be handled, otherwise return false. 
+static bool addUsersInExitBlocks(VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix) { if (ExitUsersToFix.empty()) - return; + return true; auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); @@ -8903,20 +9020,25 @@ addUsersInExitBlocks(VPlan &Plan, // Introduce extract for exiting values and update the VPIRInstructions // modeling the corresponding LCSSA phis. for (VPIRInstruction *ExitIRI : ExitUsersToFix) { - VPValue *V = ExitIRI->getOperand(0); - // Pass live-in values used by exit phis directly through to their users in - // the exit block. - if (V->isLiveIn()) - continue; + for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) { + // Pass live-in values used by exit phis directly through to their users + // in the exit block. + if (Op->isLiveIn()) + continue; + + // Currently only live-ins can be used by exit values from blocks not + // exiting via the vector latch through to the middle block. + if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) + return false; - assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && - "Exit value not handled yet for this edge."); - LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); - VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, - {V, Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(Ctx, 32), 1))}); - ExitIRI->setOperand(0, Ext); + LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); + VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, + {Op, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(Ctx, 32), 1))}); + ExitIRI->setOperand(Idx, Ext); + } } + return true; } /// Handle users in the exit block for first order reductions in the original @@ -9176,7 +9298,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->appendRecipe(Recipe); } - VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + 
VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } @@ -9189,11 +9311,22 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + if (auto *UncountableExitingBlock = + Legal->getUncountableEarlyExitingBlock()) { + VPlanTransforms::handleUncountableEarlyExit( + *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); + } addScalarResumePhis(RecipeBuilder, *Plan); SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks( OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); - addUsersInExitBlocks(*Plan, ExitUsersToFix); + if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { + reportVectorizationFailure( + "Some exit values in loop with uncountable exit not supported yet", + "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); + return nullptr; + } + // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. @@ -9304,6 +9437,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DebugLoc()); + + // Collect mapping of IR header phis to header phi recipes, to be used in + // addScalarResumePhis. 
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe>(&R)) + continue; + auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); + RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); + } + addScalarResumePhis(RecipeBuilder, *Plan); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9334,8 +9479,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); - assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - "AnyOf reductions are not allowed for in-loop reductions"); + assert( + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && + "AnyOf and FindLast reductions are not allowed for in-loop reductions"); // Collect the chain of "link" recipes for the reduction starting at PhiR. SetVector<VPSingleDefRecipe *> Worklist; @@ -9439,9 +9586,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (CM.blockNeedsPredicationForAnyReason(BB)) CondOp = RecipeBuilder.getBlockInMask(BB); - VPReductionRecipe *RedRecipe = - new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, - CondOp, CM.useOrderedReductions(RdxDesc)); + auto *RedRecipe = new VPReductionRecipe( + RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, + CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Note that this transformation may leave over dead recipes (including @@ -9566,6 +9713,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Convert the reduction phi to operate on bools. 
PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( OrigLoop->getHeader()->getContext()))); + continue; + } + + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + // Adjust the start value for FindLastIV recurrences to use the sentinel + // value after generating the ResumePhi recipe, which uses the original + // start value. + PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); } } @@ -9581,13 +9737,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPLane(0)); - Value *CanonicalIV = State.get(getOperand(1), VPLane(0)); + Value *Index = State.get(getOperand(1), VPLane(0)); Value *DerivedIV = emitTransformedIndex( - State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, - Kind, cast_if_present<BinaryOperator>(FPBinOp)); + State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, + cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); - + // If index is the vector trip count, the concrete value will only be set in + // prepareToExecute, leading to missed simplifications, e.g. if it is 0. + // TODO: Remove the special case for the vector trip count once it is computed + // in VPlan and can be used during VPlan simplification. + assert((DerivedIV != Index || + getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && + "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } @@ -9897,6 +10058,164 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || !EnableLoopVectorization) {} +/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue +/// vectorization. 
Remove ResumePhis from \p MainPlan for inductions that +/// don't have a corresponding wide induction in \p EpiPlan. +static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { + // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those + // will need their resume-values computed in the main vector loop. Others + // can be removed from the main VPlan. + SmallPtrSet<PHINode *, 2> EpiWidenedPhis; + for (VPRecipeBase &R : + EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe>(&R)) + continue; + EpiWidenedPhis.insert( + cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); + } + for (VPRecipeBase &R : make_early_inc_range( + *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { + auto *VPIRInst = cast<VPIRInstruction>(&R); + auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); + if (!IRI) + break; + if (EpiWidenedPhis.contains(IRI)) + continue; + // There is no corresponding wide induction in the epilogue plan that would + // need a resume value. Remove the VPIRInst wrapping the scalar header phi + // together with the corresponding ResumePhi. The resume values for the + // scalar loop will be created during execution of EpiPlan. + VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); + VPIRInst->eraseFromParent(); + ResumePhi->eraseFromParent(); + } + VPlanTransforms::removeDeadRecipes(MainPlan); + + using namespace VPlanPatternMatch; + VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); + VPValue *VectorTC = &MainPlan.getVectorTripCount(); + // If there is a suitable resume value for the canonical induction in the + // scalar (which will become vector) epilogue loop we are done. Otherwise + // create it below. 
+ if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { + return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( + m_Specific(VectorTC), m_SpecificInt(0))); + })) + return; + VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); + ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, + {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, + "vec.epilog.resume.val"); +} + +/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded +/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. +static void +preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, + const SCEV2ValueTy &ExpandedSCEVs, + const EpilogueLoopVectorizationInfo &EPI) { + VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); + Header->setName("vec.epilog.vector.body"); + + // Re-use the trip count and steps expanded for the main loop, as + // skeleton creation needs it as a value that dominates both the scalar + // and vector epilogue loops + // TODO: This is a workaround needed for epilogue vectorization and it + // should be removed once induction resume value creation is done + // directly in VPlan. + for (auto &R : make_early_inc_range(*Plan.getEntry())) { + auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); + if (!ExpandR) + continue; + auto *ExpandedVal = + Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); + ExpandR->replaceAllUsesWith(ExpandedVal); + if (Plan.getTripCount() == ExpandR) + Plan.resetTripCount(ExpandedVal); + ExpandR->eraseFromParent(); + } + + // Ensure that the start values for all header phi recipes are updated before + // vectorizing the epilogue loop. + for (VPRecipeBase &R : Header->phis()) { + if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { + // When vectorizing the epilogue loop, the canonical induction start + // value needs to be changed from zero to the value after the main + // vector loop. 
Find the resume value created during execution of the main + // VPlan. + // FIXME: Improve modeling for canonical IV start values in the epilogue + // loop. + BasicBlock *MainMiddle = find_singleton<BasicBlock>( + predecessors(L->getLoopPreheader()), + [&EPI](BasicBlock *BB, bool) -> BasicBlock * { + if (BB != EPI.MainLoopIterationCountCheck && + BB != EPI.EpilogueIterationCountCheck && + BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck) + return BB; + return nullptr; + }); + using namespace llvm::PatternMatch; + Type *IdxTy = IV->getScalarType(); + PHINode *EPResumeVal = find_singleton<PHINode>( + L->getLoopPreheader()->phis(), + [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * { + if (P.getType() == IdxTy && + P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount && + match( + P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), + m_SpecificInt(0))) + return &P; + return nullptr; + }); + assert(EPResumeVal && "must have a resume value for the canonical IV"); + VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); + assert(all_of(IV->users(), + [](const VPUser *U) { + return isa<VPScalarIVStepsRecipe>(U) || + isa<VPScalarCastRecipe>(U) || + isa<VPDerivedIVRecipe>(U) || + cast<VPInstruction>(U)->getOpcode() == + Instruction::Add; + }) && + "the canonical IV should only be used by its increment or " + "ScalarIVSteps when resetting the start value"); + IV->setOperand(0, VPV); + continue; + } + + Value *ResumeV = nullptr; + // TODO: Move setting of resume values to prepareToExecute. 
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { + ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) + ->getIncomingValueForBlock(L->getLoopPreheader()); + const RecurrenceDescriptor &RdxDesc = + ReductionPhi->getRecurrenceDescriptor(); + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + // VPReductionPHIRecipes for AnyOf reductions expect a boolean as + // start value; compare the final value from the main vector loop + // to the start value. + IRBuilder<> Builder( + cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); + ResumeV = + Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); + } + } else { + // Retrieve the induction resume values for wide inductions from + // their original phi nodes in the scalar loop. + PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); + // Hook up to the PHINode generated by a ResumePhi recipe of main + // loop VPlan, which feeds the scalar loop. + ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); + } + assert(ResumeV && "Must have a resume value"); + VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); + cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); + } +} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. 
Only process inner loops."); @@ -9946,12 +10265,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasUncountableEarlyExit()) { + if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { reportVectorizationFailure("Auto-vectorization of loops with uncountable " - "early exit is not yet supported", - "Auto-vectorization of loops with uncountable " - "early exit is not yet supported", - "UncountableEarlyExitLoopsUnsupported", ORE, L); + "early exit is not enabled", + "UncountableEarlyExitLoopsDisabled", ORE, L); return false; } @@ -9977,6 +10294,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (UseInterleaved) IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + if (LVL.hasUncountableEarlyExit()) { + BasicBlock *LoopLatch = L->getLoopLatch(); + if (IAI.requiresScalarEpilogue() || + any_of(LVL.getCountableExitingBlocks(), + [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { + reportVectorizationFailure("Auto-vectorization of early exit loops " + "requiring a scalar epilogue is unsupported", + "UncountableEarlyExitUnsupported", ORE, L); + return false; + } + } + // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = @@ -10243,11 +10572,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); + VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); + preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, + BestEpiPlan); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks, *BestMainPlan); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, false); ++LoopsVectorized; @@ -10256,84 +10587,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; - VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF); EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks, BestEpiPlan); - - VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); - VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); - Header->setName("vec.epilog.vector.body"); - - // Re-use the trip count and steps expanded for the main loop, as - // skeleton creation needs it as a value that dominates both the scalar - // and vector epilogue loops - // TODO: This is a workaround needed for epilogue vectorization and it - // should be removed once induction resume value creation is done - // directly in VPlan. 
EpilogILV.setTripCount(MainILV.getTripCount()); - for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { - auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); - if (!ExpandR) - continue; - auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( - ExpandedSCEVs.find(ExpandR->getSCEV())->second); - ExpandR->replaceAllUsesWith(ExpandedVal); - if (BestEpiPlan.getTripCount() == ExpandR) - BestEpiPlan.resetTripCount(ExpandedVal); - ExpandR->eraseFromParent(); - } - - // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, - // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated - // before vectorizing the epilogue loop. - for (VPRecipeBase &R : Header->phis()) { - if (isa<VPCanonicalIVPHIRecipe>(&R)) - continue; + preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - Value *ResumeV = nullptr; - // TODO: Move setting of resume values to prepareToExecute. - if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { - ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) - ->getIncomingValueForBlock(L->getLoopPreheader()); - const RecurrenceDescriptor &RdxDesc = - ReductionPhi->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { - // VPReductionPHIRecipes for AnyOf reductions expect a boolean as - // start value; compare the final value from the main vector loop - // to the start value. - IRBuilder<> Builder( - cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); - ResumeV = Builder.CreateICmpNE(ResumeV, - RdxDesc.getRecurrenceStartValue()); - } - } else { - // Create induction resume values for both widened pointer and - // integer/fp inductions and update the start value of the induction - // recipes to use the resume value. 
- PHINode *IndPhi = nullptr; - const InductionDescriptor *ID; - if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { - IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); - ID = &Ind->getInductionDescriptor(); - } else { - auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); - IndPhi = WidenInd->getPHINode(); - ID = &WidenInd->getInductionDescriptor(); - } - - ResumeV = MainILV.createInductionResumeValue( - IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), - {EPI.MainLoopIterationCountCheck}); - } - assert(ResumeV && "Must have a resume value"); - VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV); - cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); - } - - assert(DT->verify(DominatorTree::VerificationLevel::Fast) && - "DT not preserved correctly"); LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, true, &ExpandedSCEVs); ++LoopsEpilogueVectorized; @@ -10361,6 +10620,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { checkMixedPrecision(L, ORE); } + assert(DT->verify(DominatorTree::VerificationLevel::Fast) && + "DT not preserved correctly"); + std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 33657c26356d..f52ddfda5e64 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -344,6 +344,8 @@ static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) { unsigned SVNumElements = cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); unsigned ShuffleMaskSize = SV->getShuffleMask().size(); + if (SVNumElements % ShuffleMaskSize != 0) + return 0; unsigned GroupSize = SVNumElements / ShuffleMaskSize; if (GroupSize == 0 || (VL.size() % GroupSize) != 0) return 0; @@ -514,7 +516,7 @@ static bool isCommutative(Instruction *I) { BO->uses(), 
[](const Use &U) { // Commutative, if icmp eq/ne sub, 0 - ICmpInst::Predicate Pred; + CmpPredicate Pred; if (match(U.getUser(), m_ICmp(Pred, m_Specific(U.get()), m_Zero())) && (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) @@ -700,7 +702,8 @@ static SmallBitVector isUndefVector(const Value *V, /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? static std::optional<TargetTransformInfo::ShuffleKind> -isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { +isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + AssumptionCache *AC) { const auto *It = find_if(VL, IsaPred<ExtractElementInst>); if (It == VL.end()) return std::nullopt; @@ -717,14 +720,14 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec1 = nullptr; Value *Vec2 = nullptr; - bool HasNonUndefVec = any_of(VL, [](Value *V) { + bool HasNonUndefVec = any_of(VL, [&](Value *V) { auto *EE = dyn_cast<ExtractElementInst>(V); if (!EE) return false; Value *Vec = EE->getVectorOperand(); if (isa<UndefValue>(Vec)) return false; - return isGuaranteedNotToBePoison(Vec); + return isGuaranteedNotToBePoison(Vec, AC); }); enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; @@ -807,14 +810,16 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) { namespace { /// Main data required for vectorization of instructions. -struct InstructionsState { - /// The very first instruction in the list with the main opcode. - Value *OpValue = nullptr; - - /// The main/alternate instruction. +class InstructionsState { + /// The main/alternate instruction. MainOp is also VL0. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; +public: + Instruction *getMainOp() const { return MainOp; } + + Instruction *getAltOp() const { return AltOp; } + /// The main/alternate opcodes for the list of instructions. unsigned getOpcode() const { return MainOp ? 
MainOp->getOpcode() : 0; @@ -833,9 +838,9 @@ struct InstructionsState { } InstructionsState() = delete; - InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) - : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} - static InstructionsState invalid() { return {nullptr, nullptr, nullptr}; } + InstructionsState(Instruction *MainOp, Instruction *AltOp) + : MainOp(MainOp), AltOp(AltOp) {} + static InstructionsState invalid() { return {nullptr, nullptr}; } }; } // end anonymous namespace @@ -1073,7 +1078,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, return InstructionsState::invalid(); } - return InstructionsState(V, cast<Instruction>(V), + return InstructionsState(cast<Instruction>(V), cast<Instruction>(VL[AltIndex])); } @@ -1087,7 +1092,8 @@ static bool allSameType(ArrayRef<Value *> VL) { /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, - TargetLibraryInfo *TLI) { + TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { if (!UserInst) return false; unsigned Opcode = UserInst->getOpcode(); @@ -1104,7 +1110,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); return any_of(enumerate(CI->args()), [&](auto &&Arg) { - return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) && + return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) && Arg.value().get() == Scalar; }); } @@ -1842,12 +1848,12 @@ public: // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. 
if (S.getOpcode() && - (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() || + (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || !S.isAltShuffle()) && all_of(Ops, [&S](Value *V) { return isa<PoisonValue>(V) || cast<Instruction>(V)->getNumOperands() == - S.MainOp->getNumOperands(); + S.getMainOp()->getNumOperands(); })) return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes : LookAheadHeuristics::ScoreSameOpcode; @@ -2017,6 +2023,9 @@ public: /// A vector of operand vectors. SmallVector<OperandDataVec, 4> OpsVec; + /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] + /// is not IntrinsicInst, ArgSize is User::getNumOperands. + unsigned ArgSize = 0; const TargetLibraryInfo &TLI; const DataLayout &DL; @@ -2400,14 +2409,15 @@ public: } /// Go through the instructions in VL and append their operands. - void appendOperandsOfVL(ArrayRef<Value *> VL) { + void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); + // IntrinsicInst::isCommutative returns true if swapping the first "two" + // arguments to the intrinsic produces the same result. constexpr unsigned IntrinsicNumOperands = 2; - auto *VL0 = cast<Instruction>(*find_if(VL, IsaPred<Instruction>)); - unsigned NumOperands = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands - : VL0->getNumOperands(); + unsigned NumOperands = VL0->getNumOperands(); + ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { @@ -2440,7 +2450,7 @@ public: } /// \returns the number of operands. - unsigned getNumOperands() const { return OpsVec.size(); } + unsigned getNumOperands() const { return ArgSize; } /// \returns the number of lanes. 
unsigned getNumLanes() const { return OpsVec[0].size(); } @@ -2460,6 +2470,8 @@ public: /// the whole vector (it is mixed with constants or loop invariant values). /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { + assert(Op == getValue(OpIdx, Lane) && + "Op is expected to be getValue(OpIdx, Lane)."); // Small number of loads - try load matching. if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2) return false; @@ -2517,6 +2529,8 @@ public: /// Checks if there is at least single compatible operand in lanes other /// than \p Lane, compatible with the operand \p Op. bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const { + assert(Op == getValue(OpIdx, Lane) && + "Op is expected to be getValue(OpIdx, Lane)."); bool OpAPO = getData(OpIdx, Lane).APO; for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { if (Ln == Lane) @@ -2537,13 +2551,11 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R) + VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor( - (cast<Instruction>(*find_if(RootVL, IsaPred<Instruction>)) - ->getParent()))) { + L(R.LI->getLoopFor((VL0->getParent()))) { // Append all the operands of RootVL. 
- appendOperandsOfVL(RootVL); + appendOperandsOfVL(RootVL, VL0); } /// \Returns a value vector with the operands across all lanes for the @@ -2617,7 +2629,8 @@ public: ArrayRef<OperandData> Op0 = OpsVec.front(); for (const OperandData &Data : Op0) UniqueValues.insert(Data.V); - for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + for (ArrayRef<OperandData> Op : + ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) { if (any_of(Op, [&UniqueValues](const OperandData &Data) { return !UniqueValues.contains(Data.V); })) @@ -2920,13 +2933,11 @@ private: /// truncation. We collect the entries that will be demoted in ToDemote. /// \param E Node for analysis /// \param ToDemote indices of the nodes to be demoted. - bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot, - unsigned &BitWidth, - SmallVectorImpl<unsigned> &ToDemote, - DenseSet<const TreeEntry *> &Visited, - unsigned &MaxDepthLevel, - bool &IsProfitableToDemote, - bool IsTruncRoot) const; + bool collectValuesToDemote( + const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth, + SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited, + const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel, + bool &IsProfitableToDemote, bool IsTruncRoot) const; /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one @@ -3138,13 +3149,6 @@ private: SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8> &GatheredLoads); - /// Reorder commutative or alt operands to get better probability of - /// generating vectorized code. - static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, - const BoUpSLP &R); - /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. 
It returns the map from the store /// pointers to the collected stores. @@ -3307,7 +3311,7 @@ private: /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from /// other nodes as a series of insertvector instructions. - SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices; + SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices; private: /// The operands of each instruction in each lane Operands[op_index][lane]. @@ -3339,27 +3343,13 @@ private: copy(OpVL, Operands[OpIdx].begin()); } - /// Set the operands of this bundle in their original order. - void setOperandsInOrder() { - assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast<Instruction>(*find_if(Scalars, IsaPred<Instruction>)); - Operands.resize(I0->getNumOperands()); - unsigned NumLanes = Scalars.size(); - for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); - OpIdx != NumOperands; ++OpIdx) { - Operands[OpIdx].resize(NumLanes); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - if (isa<PoisonValue>(Scalars[Lane])) { - Operands[OpIdx][Lane] = - PoisonValue::get(I0->getOperand(OpIdx)->getType()); - continue; - } - auto *I = cast<Instruction>(Scalars[Lane]); - assert(I->getNumOperands() == NumOperands && - "Expected same number of operands"); - Operands[OpIdx][Lane] = I->getOperand(OpIdx); - } - } + /// Set this bundle's operand from Scalars. + void setOperand(const BoUpSLP &R, bool RequireReorder = false) { + VLOperands Ops(Scalars, MainOp, R); + if (RequireReorder) + Ops.reorder(); + for (unsigned I : seq<unsigned>(MainOp->getNumOperands())) + setOperand(I, Ops.getVL(I)); } /// Reorders operands of the node to the given mask \p Mask. 
@@ -3410,8 +3400,8 @@ private: } void setOperations(const InstructionsState &S) { - MainOp = S.MainOp; - AltOp = S.AltOp; + MainOp = S.getMainOp(); + AltOp = S.getAltOp(); } Instruction *getMainOp() const { @@ -3555,6 +3545,13 @@ private: for (const auto &EInfo : UserTreeIndices) dbgs() << EInfo << ", "; dbgs() << "\n"; + if (!CombinedEntriesWithIndices.empty()) { + dbgs() << "Combined entries: "; + interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) { + dbgs() << "Entry index " << P.first << " with offset " << P.second; + }); + dbgs() << "\n"; + } } #endif }; @@ -3649,8 +3646,8 @@ private: } // Update the scheduler bundle to point to this TreeEntry. ScheduleData *BundleMember = *Bundle; - assert((BundleMember || isa<PHINode>(S.MainOp) || - isVectorLikeInstWithConstOps(S.MainOp) || + assert((BundleMember || isa<PHINode>(S.getMainOp()) || + isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); if (BundleMember) { @@ -3717,9 +3714,11 @@ private: /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. - TreeEntry::EntryState getScalarsVectorizationState( - InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps); + TreeEntry::EntryState + getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL, + bool IsScatterVectorizeUserTE, + OrdersType &CurrentOrder, + SmallVectorImpl<Value *> &PointerOps); /// Maps a specific scalar to its tree entry. SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; @@ -4790,8 +4789,10 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) { /// Check if \p Order represents reverse order. static bool isReverseOrder(ArrayRef<unsigned> Order) { + assert(!Order.empty() && + "Order is empty. 
Please check it before using isReverseOrder."); unsigned Sz = Order.size(); - return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) { + return all_of(enumerate(Order), [&](const auto &Pair) { return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value(); }); } @@ -5642,8 +5643,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { auto PHICompare = [&](unsigned I1, unsigned I2) { Value *V1 = TE.Scalars[I1]; Value *V2 = TE.Scalars[I2]; - if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) || - isa<PoisonValue>(V1) || isa<PoisonValue>(V2)) + if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0)) + return false; + if (isa<PoisonValue>(V1)) + return true; + if (isa<PoisonValue>(V2)) return false; if (V1->getNumUses() < V2->getNumUses()) return true; @@ -6511,7 +6515,7 @@ void BoUpSLP::buildExternalUses( // be used. if (UseEntry->State == TreeEntry::ScatterVectorize || !doesInTreeUserNeedToExtract( - Scalar, getRootEntryInstruction(*UseEntry), TLI)) { + Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->isGather() && "Bad state"); @@ -6935,8 +6939,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // 2. All users are deleted. // 3. The load broadcasts are not allowed or the load is not // broadcasted. - if (std::distance(LI->user_begin(), LI->user_end()) != - LI->getNumUses()) + if (static_cast<unsigned int>(std::distance( + LI->user_begin(), LI->user_end())) != LI->getNumUses()) return false; if (!IsLegalBroadcastLoad) continue; @@ -7426,17 +7430,17 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, unsigned Opcode1 = S.getAltOpcode(); SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1)); // If this pattern is supported by the target then consider it profitable. 
- if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()), + if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()), Opcode0, Opcode1, OpcodeMask)) return true; SmallVector<ValueList> Operands; - for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) { + for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) { Operands.emplace_back(); // Prepare the operand vector. for (Value *V : VL) { if (isa<PoisonValue>(V)) { Operands.back().push_back( - PoisonValue::get(S.MainOp->getOperand(I)->getType())); + PoisonValue::get(S.getMainOp()->getOperand(I)->getType())); continue; } Operands.back().push_back(cast<Instruction>(V)->getOperand(I)); @@ -7486,7 +7490,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, ++ExtraShuffleInsts; } } - const Loop *L = LI->getLoopFor(S.MainOp->getParent()); + const Loop *L = LI->getLoopFor(S.getMainOp()->getParent()); // Vectorize node, if: // 1. at least single operand is constant or splat. // 2. Operands have many loop invariants (the instructions are not loop @@ -7496,7 +7500,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, [&](ArrayRef<Value *> Op) { if (allConstant(Op) || (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) && - getSameOpcode(Op, *TLI).MainOp)) + getSameOpcode(Op, *TLI).getMainOp())) return false; DenseMap<Value *, unsigned> Uniques; for (Value *V : Op) { @@ -7528,19 +7532,21 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, // vector operands is number of vector instructions + number of vector // instructions for operands (buildvectors). Number of buildvector // instructions is just number_of_operands * number_of_scalars. 
- (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() && + (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() && (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + - NumAltInsts) < S.MainOp->getNumOperands() * VL.size()); + NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size()); } BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( - InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) { - assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); + const InstructionsState &S, ArrayRef<Value *> VL, + bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, + SmallVectorImpl<Value *> &PointerOps) { + assert(S.getMainOp() && + "Expected instructions with same/alternate opcodes only."); unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); - auto *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = S.getMainOp(); switch (ShuffleOrOp) { case Instruction::PHI: { // Too many operands - gather, most probably won't be vectorized. 
@@ -7712,7 +7718,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::Or: case Instruction::Xor: case Instruction::Freeze: - if (S.MainOp->getType()->isFloatingPointTy() && + if (S.getMainOp()->getType()->isFloatingPointTy() && TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { auto *I = dyn_cast<Instruction>(V); return I && I->isBinaryOp() && !I->isFast(); @@ -7809,7 +7815,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::NeedToGather; } case Instruction::Call: { - if (S.MainOp->getType()->isFloatingPointTy() && + if (S.getMainOp()->getType()->isFloatingPointTy() && TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { auto *I = dyn_cast<Instruction>(V); return I && !I->isFast(); @@ -7834,7 +7840,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( unsigned NumArgs = CI->arg_size(); SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr); for (unsigned J = 0; J != NumArgs; ++J) - if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) ScalarArgs[J] = CI->getArgOperand(J); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); @@ -7850,7 +7856,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. 
for (unsigned J = 0; J != NumArgs; ++J) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) { Value *A1J = CI2->getArgOperand(J); if (ScalarArgs[J] != A1J) { LLVM_DEBUG(dbgs() @@ -8035,7 +8041,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return isa<UndefValue>(V) || !isConstant(V); }))) { if (DoNotFail && UniquePositions.size() > 1 && - NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && + NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) { // Find the number of elements, which forms full vectors. unsigned PWSz = getFullVectorNumberOfElements( @@ -8065,8 +8071,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no // place to insert a shuffle if we need to, so just avoid that issue. - if (S.MainOp && - isa<CatchSwitchInst>(S.MainOp->getParent()->getTerminator())) { + if (S.getMainOp() && + isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -8074,10 +8080,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check if this is a duplicate of another entry. 
if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); + if (TreeEntry *E = getTreeEntry(S.getMainOp())) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() + << ".\n"); if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) { - auto It = MultiNodeScalars.find(S.OpValue); + auto It = MultiNodeScalars.find(S.getMainOp()); if (It != MultiNodeScalars.end()) { auto *TEIt = find_if(It->getSecond(), [&](TreeEntry *ME) { return ME->isSame(VL); }); @@ -8090,7 +8097,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } if (!E) { - if (!doesNotNeedToBeScheduled(S.OpValue)) { + if (!doesNotNeedToBeScheduled(S.getMainOp())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, @@ -8098,8 +8105,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } SmallPtrSet<const TreeEntry *, 4> Nodes; - Nodes.insert(getTreeEntry(S.OpValue)); - for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) + Nodes.insert(getTreeEntry(S.getMainOp())); + for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp())) Nodes.insert(E); SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end()); if (any_of(Nodes, [&](const TreeEntry *E) { @@ -8122,7 +8129,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // used to properly draw the graph rather than for the actual // vectorization. E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp() << ".\n"); return; } @@ -8133,13 +8140,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // a load), in which case peek through to include it in the tree, without // ballooning over-budget. 
if (Depth >= RecursionMaxDepth && - !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp && - VL.size() >= 4 && - (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { + !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 && + (match(S.getMainOp(), m_Load(m_Value())) || + all_of(VL, [&S](const Value *I) { return match(I, m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && cast<Instruction>(I)->getOpcode() == - cast<Instruction>(S.MainOp)->getOpcode(); + S.getMainOp()->getOpcode(); })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) @@ -8151,7 +8158,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't handle scalable vectors if (S.getOpcode() == Instruction::ExtractElement && isa<ScalableVectorType>( - cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { + cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, @@ -8188,7 +8195,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op); })); } - bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp); + bool IsCommutative = + isCommutative(S.getMainOp()) || isCommutative(S.getAltOp()); if ((IsCommutative && std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || (!IsCommutative && @@ -8198,20 +8206,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; auto *I1 = cast<Instruction>(VL.front()); auto *I2 = cast<Instruction>(VL.back()); - for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + for (int Op : seq<int>(S.getMainOp()->getNumOperands())) Candidates.emplace_back().emplace_back(I1->getOperand(Op), I2->getOperand(Op)); if 
(static_cast<unsigned>(count_if( Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); - })) >= S.MainOp->getNumOperands() / 2) + })) >= S.getMainOp()->getNumOperands() / 2) return false; - if (S.MainOp->getNumOperands() > 2) + if (S.getMainOp()->getNumOperands() > 2) return true; if (IsCommutative) { // Check permuted operands. Candidates.clear(); - for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) + for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op) Candidates.emplace_back().emplace_back(I1->getOperand(Op), I2->getOperand((Op + 1) % E)); if (any_of( @@ -8246,7 +8254,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>( - S.OpValue) && + S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); @@ -8313,10 +8321,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that all of the users of the scalars that we want to vectorize are // schedulable. - auto *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = S.getMainOp(); BB = VL0->getParent(); - if (S.MainOp && + if (S.getMainOp() && (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) || !DT->isReachableFromEntry(BB))) { // Don't go into unreachable blocks. 
They may contain instructions with @@ -8394,7 +8402,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n"; + TE->dump()); // Keeps the reordered operands to avoid code duplication. PHIHandler Handler(*DT, PH, VL); @@ -8423,13 +8432,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices, CurrentOrder); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices, CurrentOrder); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry " + "(ExtractValueInst/ExtractElementInst).\n"; + TE->dump()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. 
- ValueList Op0; - Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0); + TE->setOperand(*this); return; } case Instruction::InsertElement: { @@ -8457,9 +8467,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CurrentOrder.clear(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, {}, CurrentOrder); - LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n"; + TE->dump()); - TE->setOperandsInOrder(); + TE->setOperand(*this); buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1}); return; } @@ -8477,30 +8488,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices, CurrentOrder, InterleaveFactor); if (CurrentOrder.empty()) - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n"; + TE->dump()); else - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - TE->setOperandsInOrder(); + LLVM_DEBUG(dbgs() + << "SLP: added a new TreeEntry (jumbled LoadInst).\n"; + TE->dump()); break; case TreeEntry::StridedVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndices, CurrentOrder); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n"; + TE->dump()); break; case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndices); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + LLVM_DEBUG( + dbgs() + << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n"; + TE->dump()); break; case TreeEntry::CombinedVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } + TE->setOperand(*this); + if (State == TreeEntry::ScatterVectorize) + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); return; } case Instruction::ZExt: @@ -8536,10 +8553,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n"; + TE->dump()); - TE->setOperandsInOrder(); - for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) + TE->setOperand(*this); + for (unsigned I : seq<unsigned>(VL0->getNumOperands())) buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); if (ShuffleOrOp == Instruction::Trunc) { ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx); @@ -8563,15 +8581,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n"; + TE->dump()); ValueList Left, Right; + VLOperands Ops(VL, VL0, *this); if (cast<CmpInst>(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. 
assert(P0 == CmpInst::getSwappedPredicate(P0) && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *this); + Ops.reorder(); + Left = Ops.getVL(0); + Right = Ops.getVL(1); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -8630,29 +8652,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::Freeze: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); - - // Sort operands of the instructions so that each side is more likely to - // have the same opcode. - if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - return; - } + LLVM_DEBUG( + dbgs() << "SLP: added a new TreeEntry " + "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n"; + TE->dump()); - TE->setOperandsInOrder(); - for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) + TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0)); + for (unsigned I : seq<unsigned>(VL0->getNumOperands())) buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); return; } case Instruction::GetElementPtr: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n"; + TE->dump()); SmallVector<ValueList, 2> Operands(2); // Prepare the operand vector for pointer operands. 
for (Value *V : VL) { @@ -8710,12 +8724,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices, CurrentOrder); - TE->setOperandsInOrder(); - buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0}); if (Consecutive) - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n"; + TE->dump()); else - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + LLVM_DEBUG( + dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n"; + TE->dump()); + TE->setOperand(*this); + buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0}); return; } case Instruction::Call: { @@ -8726,93 +8743,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - // Sort operands of the instructions so that each side is more likely to - // have the same opcode. 
- if (isCommutative(VL0)) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - SmallVector<ValueList> Operands; - for (unsigned I : seq<unsigned>(2, CI->arg_size())) { - Operands.emplace_back(); - if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) - continue; - for (Value *V : VL) { - auto *CI2 = cast<CallInst>(V); - Operands.back().push_back(CI2->getArgOperand(I)); - } - TE->setOperand(I, Operands.back()); - } - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - for (unsigned I : seq<unsigned>(2, CI->arg_size())) { - if (Operands[I - 2].empty()) - continue; - buildTree_rec(Operands[I - 2], Depth + 1, {TE, I}); - } - return; - } - TE->setOperandsInOrder(); - for (unsigned I : seq<unsigned>(0, CI->arg_size())) { + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n"; + TE->dump()); + TE->setOperand(*this, isCommutative(VL0)); + for (unsigned I : seq<unsigned>(CI->arg_size())) { // For scalar operands no need to create an entry since no need to // vectorize it. - if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) continue; - ValueList Operands; - // Prepare the operand vector. 
- for (Value *V : VL) { - auto *CI2 = cast<CallInst>(V); - Operands.push_back(CI2->getArgOperand(I)); - } - buildTree_rec(Operands, Depth + 1, {TE, I}); + buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); } return; } case Instruction::ShuffleVector: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); - LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + if (S.isAltShuffle()) { + LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n"; + TE->dump()); + } else { + assert(SLPReVec && "Only supported by REVEC."); + LLVM_DEBUG( + dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n"; + TE->dump()); + } // Reorder operands if reordering would enable vectorization. auto *CI = dyn_cast<CmpInst>(VL0); - if (isa<BinaryOperator>(VL0) || CI) { + if (CI && any_of(VL, [](Value *V) { + return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative(); + })) { + auto *MainCI = cast<CmpInst>(S.getMainOp()); + auto *AltCI = cast<CmpInst>(S.getAltOp()); + CmpInst::Predicate MainP = MainCI->getPredicate(); + CmpInst::Predicate AltP = AltCI->getPredicate(); + assert(MainP != AltP && + "Expected different main/alternate predicates."); ValueList Left, Right; - if (!CI || all_of(VL, [](Value *V) { - return isa<PoisonValue>(V) || cast<CmpInst>(V)->isCommutative(); - })) { - reorderInputsAccordingToOpcode(VL, Left, Right, *this); - } else { - auto *MainCI = cast<CmpInst>(S.MainOp); - auto *AltCI = cast<CmpInst>(S.AltOp); - CmpInst::Predicate MainP = MainCI->getPredicate(); - CmpInst::Predicate AltP = AltCI->getPredicate(); - assert(MainP != AltP && - "Expected different main/alternate predicates."); - // Collect operands - commute if it uses the swapped predicate or - // alternate operation. 
- for (Value *V : VL) { - if (isa<PoisonValue>(V)) { - Left.push_back( - PoisonValue::get(MainCI->getOperand(0)->getType())); - Right.push_back( - PoisonValue::get(MainCI->getOperand(1)->getType())); - continue; - } - auto *Cmp = cast<CmpInst>(V); - Value *LHS = Cmp->getOperand(0); - Value *RHS = Cmp->getOperand(1); + // Collect operands - commute if it uses the swapped predicate or + // alternate operation. + for (Value *V : VL) { + if (isa<PoisonValue>(V)) { + Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType())); + Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType())); + continue; + } + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); - if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { - if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) - std::swap(LHS, RHS); - } else { - if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) - std::swap(LHS, RHS); - } - Left.push_back(LHS); - Right.push_back(RHS); + if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { + if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) + std::swap(LHS, RHS); + } else { + if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) + std::swap(LHS, RHS); } + Left.push_back(LHS); + Right.push_back(RHS); } TE->setOperand(0, Left); TE->setOperand(1, Right); @@ -8821,8 +8809,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } - TE->setOperandsInOrder(); - for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) + TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI); + for (unsigned I : seq<unsigned>(VL0->getNumOperands())) buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); return; } @@ -9707,7 +9695,7 @@ void BoUpSLP::transformNodes() { auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2, const InstructionsState &S) { SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; - for (unsigned Op : 
seq<unsigned>(S.MainOp->getNumOperands())) + for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands())) Candidates.emplace_back().emplace_back(I1->getOperand(Op), I2->getOperand(Op)); return all_of( @@ -9778,7 +9766,8 @@ void BoUpSLP::transformNodes() { Slice.front()->getType(), 2 * VF)), 1U, 2 * VF)) || count(Slice, Slice.front()) == - (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) { + static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1 + : 1)) { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); @@ -9791,7 +9780,7 @@ void BoUpSLP::transformNodes() { // Try to vectorize reduced values or if all users are vectorized. // For expensive instructions extra extracts might be profitable. if ((!UserIgnoreList || E.Idx != 0) && - TTI->getInstructionCost(S.MainOp, CostKind) < + TTI->getInstructionCost(S.getMainOp(), CostKind) < TTI::TCC_Expensive && !all_of(Slice, [&](Value *V) { if (isa<PoisonValue>(V)) @@ -9818,10 +9807,10 @@ void BoUpSLP::transformNodes() { continue; } } else if (S.getOpcode() == Instruction::ExtractElement || - (TTI->getInstructionCost(S.MainOp, CostKind) < + (TTI->getInstructionCost(S.getMainOp(), CostKind) < TTI::TCC_Expensive && !CheckOperandsProfitability( - S.MainOp, + S.getMainOp(), cast<Instruction>(*find_if(reverse(Slice), IsaPred<Instruction>)), S))) { @@ -9891,7 +9880,7 @@ void BoUpSLP::transformNodes() { Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. 
- if (isReverseOrder(E.ReorderIndices) && + if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { SmallVector<int> Mask; inversePermutation(E.ReorderIndices, Mask); @@ -9918,7 +9907,7 @@ void BoUpSLP::transformNodes() { Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. - if (isReverseOrder(E.ReorderIndices) && + if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { SmallVector<int> Mask; inversePermutation(E.ReorderIndices, Mask); @@ -10272,9 +10261,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // sub-Mask into the CommonMask to estimate it later and avoid double cost // estimation. if ((InVectors.size() == 2 && - InVectors.front().get<const TreeEntry *>() == &E1 && - InVectors.back().get<const TreeEntry *>() == E2) || - (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) { + cast<const TreeEntry *>(InVectors.front()) == &E1 && + cast<const TreeEntry *>(InVectors.back()) == E2) || + (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) { unsigned Limit = getNumElems(Mask.size(), SliceSize, Part); assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), [](int Idx) { return Idx == PoisonMaskElem; }) && @@ -10300,7 +10289,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VF = std::max(VF, cast<FixedVectorType>(V1->getType())->getNumElements()); } else { - const auto *E = InVectors.front().get<const TreeEntry *>(); + const auto *E = cast<const TreeEntry *>(InVectors.front()); VF = std::max(VF, E->getVectorFactor()); } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) @@ -10316,7 +10305,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { VF = std::max(VF, getNumElements(V1->getType())); } else { - const auto 
*E = P.get<const TreeEntry *>(); + const auto *E = cast<const TreeEntry *>(P); VF = std::max(VF, E->getVectorFactor()); } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) @@ -10422,9 +10411,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }; if (!V1 && !V2 && !P2.isNull()) { // Shuffle 2 entry nodes. - const TreeEntry *E = P1.get<const TreeEntry *>(); + const TreeEntry *E = cast<const TreeEntry *>(P1); unsigned VF = E->getVectorFactor(); - const TreeEntry *E2 = P2.get<const TreeEntry *>(); + const TreeEntry *E2 = cast<const TreeEntry *>(P2); CommonVF = std::max(VF, E2->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10456,7 +10445,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); } else if (!V1 && P2.isNull()) { // Shuffle single entry node. - const TreeEntry *E = P1.get<const TreeEntry *>(); + const TreeEntry *E = cast<const TreeEntry *>(P1); unsigned VF = E->getVectorFactor(); CommonVF = VF; assert( @@ -10505,7 +10494,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } else if (V1 && !V2) { // Shuffle vector and tree node. unsigned VF = getVF(V1); - const TreeEntry *E2 = P2.get<const TreeEntry *>(); + const TreeEntry *E2 = cast<const TreeEntry *>(P2); CommonVF = std::max(VF, E2->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10531,7 +10520,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } else if (!V1 && V2) { // Shuffle vector and tree node. 
unsigned VF = getVF(V2); - const TreeEntry *E1 = P1.get<const TreeEntry *>(); + const TreeEntry *E1 = cast<const TreeEntry *>(P1); CommonVF = std::max(VF, E1->getVectorFactor()); assert(all_of(Mask, [=](int Idx) { @@ -10769,8 +10758,8 @@ public: if (P.value() == PoisonMaskElem) return Mask[P.index()] == PoisonMaskElem; auto *EI = cast<ExtractElementInst>( - InVectors.front().get<const TreeEntry *>()->getOrdered( - P.index())); + cast<const TreeEntry *>(InVectors.front()) + ->getOrdered(P.index())); return EI->getVectorOperand() == V1 || EI->getVectorOperand() == V2; }) && @@ -10787,23 +10776,21 @@ public: } if (ForExtracts) { // No need to add vectors here, already handled them in adjustExtracts. - assert( - InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() && - !CommonMask.empty() && - all_of(enumerate(CommonMask), - [&](auto P) { - Value *Scalar = - InVectors.front().get<const TreeEntry *>()->getOrdered( - P.index()); - if (P.value() == PoisonMaskElem) - return P.value() == Mask[P.index()] || - isa<UndefValue>(Scalar); - if (isa<Constant>(V1)) - return true; - auto *EI = cast<ExtractElementInst>(Scalar); - return EI->getVectorOperand() == V1; - }) && - "Expected only tree entry for extractelement vectors."); + assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) && + !CommonMask.empty() && + all_of(enumerate(CommonMask), + [&](auto P) { + Value *Scalar = cast<const TreeEntry *>(InVectors[0]) + ->getOrdered(P.index()); + if (P.value() == PoisonMaskElem) + return P.value() == Mask[P.index()] || + isa<UndefValue>(Scalar); + if (isa<Constant>(V1)) + return true; + auto *EI = cast<ExtractElementInst>(Scalar); + return EI->getVectorOperand() == V1; + }) && + "Expected only tree entry for extractelement vectors."); return; } assert(!InVectors.empty() && !CommonMask.empty() && @@ -10818,7 +10805,7 @@ public: VF = std::max(VF, InTE->getVectorFactor()); } else { VF = std::max( - VF, cast<FixedVectorType>(InVectors.front().get<Value 
*>()->getType()) + VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType()) ->getNumElements()); } InVectors.push_back(V1); @@ -10888,7 +10875,7 @@ public: CommonMask[Idx] = Idx; assert(VF > 0 && "Expected vector length for the final value before action."); - Value *V = Vec.get<Value *>(); + Value *V = cast<Value *>(Vec); Action(V, CommonMask); InVectors.front() = V; } @@ -10998,14 +10985,14 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { /// Builds the arguments types vector for the given call instruction with the /// given \p ID for the specified vector factor. -static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI, - const Intrinsic::ID ID, - const unsigned VF, - unsigned MinBW) { +static SmallVector<Type *> +buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, + const unsigned VF, unsigned MinBW, + const TargetTransformInfo *TTI) { SmallVector<Type *> ArgTys; for (auto [Idx, Arg] : enumerate(CI->args())) { if (ID != Intrinsic::not_intrinsic) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) { ArgTys.push_back(Arg->getType()); continue; } @@ -11044,7 +11031,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, unsigned EntryVF = E->getVectorFactor(); auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF); - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->isGather()) { if (allConstant(VL)) return 0; @@ -11057,9 +11043,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } InstructionCost CommonCost = 0; SmallVector<int> Mask; - bool IsReverseOrder = isReverseOrder(E->ReorderIndices); - if (!E->ReorderIndices.empty() && - (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) { + if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize || + !isReverseOrder(E->ReorderIndices))) { SmallVector<int> NewMask; if (E->getOpcode() == 
Instruction::Store) { // For stores the order is actually a mask. @@ -11070,7 +11055,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } ::addMask(Mask, NewMask); } - if (NeedToShuffleReuses) + if (!E->ReuseShuffleIndices.empty()) ::addMask(Mask, E->ReuseShuffleIndices); if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) CommonCost = @@ -11458,7 +11443,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { - CmpInst::Predicate VecPred, SwappedVecPred; + CmpPredicate VecPred, SwappedVecPred; auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || match(VL0, MatchCmp)) @@ -11472,13 +11457,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return InstructionCost(TTI::TCC_Free); auto *VI = cast<Instruction>(UniqueValues[Idx]); - CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() - ? CmpInst::BAD_FCMP_PREDICATE - : CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy() + ? CmpInst::BAD_FCMP_PREDICATE + : CmpInst::BAD_ICMP_PREDICATE; auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); + // FIXME: Use CmpPredicate::getMatching here. if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && !match(VI, MatchCmp)) || - (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) + (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) && + CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred))) VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() ? 
CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; @@ -11707,9 +11694,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, auto GetVectorCost = [=](InstructionCost CommonCost) { auto *CI = cast<CallInst>(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - SmallVector<Type *> ArgTys = - buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(), - It != MinBWs.end() ? It->second.first : 0); + SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( + CI, ID, VecTy->getNumElements(), + It != MinBWs.end() ? It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; }; @@ -11894,7 +11881,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { TE->Scalars.size() < Limit || ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) && - isFixedVectorShuffle(TE->Scalars, Mask)) || + isFixedVectorShuffle(TE->Scalars, Mask, AC)) || (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || any_of(TE->Scalars, IsaPred<LoadInst>)); }; @@ -12959,7 +12946,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements( // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. std::optional<TTI::ShuffleKind> Res = - isFixedVectorShuffle(GatheredExtracts, Mask); + isFixedVectorShuffle(GatheredExtracts, Mask, AC); if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) { // TODO: try to check other subsets if possible. // Restore the original VL if attempt was not successful. @@ -13209,14 +13196,15 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( copy(CommonMask, Mask.begin()); } // Clear undef scalars. 
- for (int I = 0, Sz = VL.size(); I < Sz; ++I) + for (unsigned I : seq<unsigned>(VL.size())) if (isa<PoisonValue>(VL[I])) - Mask[I] = PoisonMaskElem; + Mask[Part * VL.size() + I] = PoisonMaskElem; return TargetTransformInfo::SK_PermuteSingleSrc; } // No perfect match, just shuffle, so choose the first tree node from the // tree. Entries.push_back(FirstEntries.front()); + VF = FirstEntries.front()->getVectorFactor(); } else { // Try to find nodes with the same vector factor. assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); @@ -13257,6 +13245,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.push_back(SecondEntries.front()); VF = std::max(Entries.front()->getVectorFactor(), Entries.back()->getVectorFactor()); + } else { + VF = Entries.front()->getVectorFactor(); } } @@ -13368,17 +13358,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); IsIdentity &= Mask[Idx] == Pair.second; } - switch (Entries.size()) { - case 1: - if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteSingleSrc; - break; - case 2: - if (EntryLanes.size() > 2 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteTwoSrc; - break; - default: - break; + if (ForOrder || IsIdentity || Entries.empty()) { + switch (Entries.size()) { + case 1: + if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteSingleSrc; + break; + case 2: + if (EntryLanes.size() > 2 || VL.size() <= 2) + return TargetTransformInfo::SK_PermuteTwoSrc; + break; + default: + break; + } + } else if (!isa<VectorType>(VL.front()->getType()) && + (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { + // Do the cost estimation if shuffle beneficial than buildvector. 
+ SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size())); + int MinElement = SubMask.front(), MaxElement = SubMask.front(); + for (int Idx : SubMask) { + if (Idx == PoisonMaskElem) + continue; + if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF) + MinElement = Idx; + if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF) + MaxElement = Idx; + } + assert(MaxElement >= 0 && MinElement >= 0 && + MaxElement % VF >= MinElement % VF && + "Expected at least single element."); + unsigned NewVF = std::max<unsigned>( + VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(), + (MaxElement % VF) - + (MinElement % VF) + 1)); + if (NewVF < VF) { + for_each(SubMask, [&](int &Idx) { + if (Idx == PoisonMaskElem) + return; + Idx = (Idx % VF) - (MinElement % VF) + + (Idx >= static_cast<int>(VF) ? NewVF : 0); + }); + VF = NewVF; + } + + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto *VecTy = getWidenedType(VL.front()->getType(), VF); + auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); + auto GetShuffleCost = [&, + &TTI = *TTI](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> Entries, + VectorType *VecTy) -> InstructionCost { + if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Mask, Entries.front()->getInterleaveFactor())) + return TTI::TCC_Free; + return ::getShuffleCost(TTI, + Entries.size() > 1 ? TTI::SK_PermuteTwoSrc + : TTI::SK_PermuteSingleSrc, + VecTy, Mask, CostKind); + }; + InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy); + InstructionCost FirstShuffleCost = 0; + SmallVector<int> FirstMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[0]->isGather()) { + FirstShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. 
+ APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(FirstMask)) { + if (Idx >= static_cast<int>(VF)) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) + IsIdentity &= static_cast<int>(I) == Idx; + } + } + if (!IsIdentity) + FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy); + FirstShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + InstructionCost SecondShuffleCost = 0; + SmallVector<int> SecondMask(SubMask.begin(), SubMask.end()); + if (Entries.size() == 1 || !Entries[1]->isGather()) { + SecondShuffleCost = ShuffleCost; + } else { + // Transform mask to include only first entry. + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + bool IsIdentity = true; + for (auto [I, Idx] : enumerate(SecondMask)) { + if (Idx < static_cast<int>(VF) && Idx >= 0) { + Idx = PoisonMaskElem; + } else { + DemandedElts.clearBit(I); + if (Idx != PoisonMaskElem) { + Idx -= VF; + IsIdentity &= static_cast<int>(I) == Idx; + } + } + } + if (!IsIdentity) + SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy); + SecondShuffleCost += TTI->getScalarizationOverhead( + MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + } + APInt DemandedElts = APInt::getAllOnes(SubMask.size()); + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx == PoisonMaskElem) + DemandedElts.clearBit(I); + InstructionCost BuildVectorCost = + TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind); + const TreeEntry *BestEntry = nullptr; + if (FirstShuffleCost < ShuffleCost) { + copy(FirstMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries.front(); + ShuffleCost = FirstShuffleCost; + } + if (SecondShuffleCost < ShuffleCost) { + copy(SecondMask, std::next(Mask.begin(), Part * VL.size())); + BestEntry = Entries[1]; + 
ShuffleCost = SecondShuffleCost; + } + if (BuildVectorCost >= ShuffleCost) { + if (BestEntry) { + Entries.clear(); + Entries.push_back(BestEntry); + } + return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc + : TargetTransformInfo::SK_PermuteSingleSrc; + } } Entries.clear(); // Clear the corresponding mask elements. @@ -13526,21 +13640,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, return Cost; } -// Perform operand reordering on the instructions in VL and return the reordered -// operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, - const BoUpSLP &R) { - if (VL.empty()) - return; - VLOperands Ops(VL, R); - // Reorder the operands in place. - Ops.reorder(); - Left = Ops.getVL(0); - Right = Ops.getVL(1); -} - Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto &Res = EntryToLastInstruction.try_emplace(E).first->second; if (Res) @@ -14481,10 +14580,10 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, VE->isSame(TE->Scalars); })); }; - TreeEntry *VE = getTreeEntry(S.OpValue); + TreeEntry *VE = getTreeEntry(S.getMainOp()); if (VE && CheckSameVE(VE)) return VE; - auto It = MultiNodeScalars.find(S.OpValue); + auto It = MultiNodeScalars.find(S.getMainOp()); if (It != MultiNodeScalars.end()) { auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { return TE != VE && CheckSameVE(TE); @@ -14862,7 +14961,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, // non-poisonous, or by freezing the incoming scalar value first. 
auto *It = find_if(Scalars, [this, E](Value *V) { return !isa<UndefValue>(V) && - (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) || (E->UserTreeIndices.size() == 1 && any_of(V->uses(), [E](const Use &U) { // Check if the value already used in the same operation in @@ -14934,11 +15033,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } if (Vec2) { IsUsedInExpr = false; - IsNonPoisoned &= - isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); + IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) && + isGuaranteedNotToBePoison(Vec2, AC); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1); + bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC); IsUsedInExpr &= FindReusedSplat( ExtractMask, cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0, @@ -14969,7 +15068,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, if (TEs.size() == 1) { bool IsNotPoisonedVec = TEs.front()->VectorizedValue - ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) + ? 
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) : true; IsUsedInExpr &= FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, @@ -14981,8 +15080,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && - isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC); } } } @@ -15133,7 +15232,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return Vec; } - bool IsReverseOrder = isReverseOrder(E->ReorderIndices); + bool IsReverseOrder = + !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); auto FinalShuffle = [&](Value *V, const TreeEntry *E) { ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store && @@ -15316,7 +15416,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } if (!IsIdentity || NumElts != NumScalars) { Value *V2 = nullptr; - bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); + bool IsVNonPoisonous = + !isConstant(V) && isGuaranteedNotToBePoison(V, AC); SmallVector<int> InsertMask(Mask); if (NumElts != NumScalars && Offset == 0) { // Follow all insert element instructions from the current buildvector @@ -15519,6 +15620,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end()) + ICmp->setSameSign(/*B=*/false); // Do not cast for cmps. 
VecTy = cast<FixedVectorType>(V->getType()); V = FinalShuffle(V, E); @@ -15881,9 +15984,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - SmallVector<Type *> ArgTys = - buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(), - It != MinBWs.end() ? It->second.first : 0); + SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( + CI, ID, VecTy->getNumElements(), + It != MinBWs.end() ? It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); bool UseIntrinsic = ID != Intrinsic::not_intrinsic && VecCallCosts.first <= VecCallCosts.second; @@ -15899,7 +16002,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. - if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) { ScalarArg = CEI->getArgOperand(I); // if decided to reduce bitwidth of abs intrinsic, it second argument // must be set false (do not return poison, if value issigned min). 
@@ -16214,6 +16317,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); + if (auto *VecI = dyn_cast<Instruction>(Vec); + VecI && VecI->getParent() == Builder.GetInsertBlock() && + Builder.GetInsertPoint()->comesBefore(VecI)) + VecI->moveBeforePreserving(*Builder.GetInsertBlock(), + Builder.GetInsertPoint()); if (Vec->getType() != PrevVec->getType()) { assert(Vec->getType()->isIntOrIntVectorTy() && PrevVec->getType()->isIntOrIntVectorTy() && @@ -16433,7 +16541,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, E->State == TreeEntry::StridedVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), - TLI); + TLI, TTI); })) && "Scalar with nullptr User must be registered in " "ExternallyUsedValues map or remain as scalar in vectorized " @@ -16966,13 +17074,13 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || - doesNotNeedToSchedule(VL)) + if (isa<PHINode>(S.getMainOp()) || + isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL)) return nullptr; // Initialize the instruction bundle. 
Instruction *OldScheduleEnd = ScheduleEnd; - LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, ScheduleData *Bundle) { @@ -17053,7 +17161,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, auto *Bundle = buildBundle(VL); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(VL, S.getMainOp()); return std::nullopt; } return Bundle; @@ -17574,8 +17682,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { bool BoUpSLP::collectValuesToDemote( const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth, SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited, - unsigned &MaxDepthLevel, bool &IsProfitableToDemote, - bool IsTruncRoot) const { + const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel, + bool &IsProfitableToDemote, bool IsTruncRoot) const { // We can always demote constants. if (all_of(E.Scalars, IsaPred<Constant>)) return true; @@ -17587,6 +17695,10 @@ bool BoUpSLP::collectValuesToDemote( return true; } + // Check if the node was analyzed already and must keep its original bitwidth. + if (NodesToKeepBWs.contains(E.Idx)) + return false; + // If the value is not a vectorized instruction in the expression and not used // by the insertelement instruction and not used in multiple vector nodes, it // cannot be demoted. 
@@ -17682,8 +17794,8 @@ bool BoUpSLP::collectValuesToDemote( for (const TreeEntry *Op : Operands) { unsigned Level = InitLevel; if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth, - ToDemote, Visited, Level, IsProfitableToDemote, - IsTruncRoot)) { + ToDemote, Visited, NodesToKeepBWs, Level, + IsProfitableToDemote, IsTruncRoot)) { if (!IsProfitableToDemote) return false; NeedToExit = true; @@ -17929,7 +18041,8 @@ bool BoUpSLP::collectValuesToDemote( // Choose the best bitwidth based on cost estimations. auto Checker = [&](unsigned BitWidth, unsigned) { unsigned MinBW = PowerOf2Ceil(BitWidth); - SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW); + SmallVector<Type *> ArgTys = + buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI); auto VecCallCosts = getVectorCallCosts( IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF), TTI, TLI, ArgTys); @@ -17985,6 +18098,7 @@ void BoUpSLP::computeMinimumValueSizes() { bool IsTruncRoot = false; bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt; SmallVector<unsigned> RootDemotes; + SmallDenseSet<unsigned, 8> NodesToKeepBWs; if (NodeIdx != 0 && VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { @@ -18008,6 +18122,7 @@ void BoUpSLP::computeMinimumValueSizes() { // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + !NodesToKeepBWs.contains(E.Idx) && E.Idx > (IsStoreOrInsertElt ? 
2u : 1u) && all_of(E.Scalars, [&](Value *V) { return V->hasOneUse() || isa<Constant>(V) || @@ -18130,8 +18245,8 @@ void BoUpSLP::computeMinimumValueSizes() { bool NeedToDemote = IsProfitableToDemote; if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth, - ToDemote, Visited, MaxDepthLevel, NeedToDemote, - IsTruncRoot) || + ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel, + NeedToDemote, IsTruncRoot) || (MaxDepthLevel <= Limit && !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) || @@ -18265,7 +18380,7 @@ void BoUpSLP::computeMinimumValueSizes() { }); } - // If the maximum bit width we compute is less than the with of the roots' + // If the maximum bit width we compute is less than the width of the roots' // type, we can proceed with the narrowing. Otherwise, do nothing. if (MaxBitWidth == 0 || MaxBitWidth >= @@ -18273,6 +18388,7 @@ void BoUpSLP::computeMinimumValueSizes() { ->getBitWidth()) { if (UserIgnoreList) AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end()); + NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end()); continue; } @@ -18432,7 +18548,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); if ((!IsAllowedSize && S.getOpcode() && S.getOpcode() != Instruction::Load && - (!S.MainOp->isSafeToRemove() || + (!S.getMainOp()->isSafeToRemove() || any_of(ValOps.getArrayRef(), [&](Value *V) { return !isa<ExtractElementInst>(V) && @@ -18969,7 +19085,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!S.getOpcode()) return false; - Instruction *I0 = cast<Instruction>(S.OpValue); + Instruction *I0 = S.getMainOp(); // Make sure invalid types (including vector type) are rejected before // determining vectorization factor for scalar instructions. 
for (Value *V : VL) { @@ -19381,7 +19497,7 @@ public: // %3 = extractelement <2 x i32> %a, i32 0 // %4 = extractelement <2 x i32> %a, i32 1 // %select = select i1 %cond, i32 %3, i32 %4 - CmpInst::Predicate Pred; + CmpPredicate Pred; Instruction *L1; Instruction *L2; @@ -19656,7 +19772,7 @@ public: /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, AssumptionCache *AC) { const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4; constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; @@ -19700,20 +19816,35 @@ public: return cast<Instruction>(ScalarCond); }; + bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) { + return isBoolLogicOp(cast<Instruction>(V)); + }); // Return new VectorizedTree, based on previous value. auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { if (VectorizedTree) { // Update the final value in the reduction. 
Builder.SetCurrentDebugLocation( cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); - if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) || - (isGuaranteedNotToBePoison(Res) && - !isGuaranteedNotToBePoison(VectorizedTree))) { - auto It = ReducedValsToOps.find(Res); - if (It != ReducedValsToOps.end() && - any_of(It->getSecond(), - [](Instruction *I) { return isBoolLogicOp(I); })) + if (AnyBoolLogicOp) { + auto It = ReducedValsToOps.find(VectorizedTree); + auto It1 = ReducedValsToOps.find(Res); + if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) || + isGuaranteedNotToBePoison(VectorizedTree, AC) || + (It != ReducedValsToOps.end() && + any_of(It->getSecond(), [&](Instruction *I) { + return isBoolLogicOp(I) && + getRdxOperand(I, 0) == VectorizedTree; + }))) { + ; + } else if (isGuaranteedNotToBePoison(Res, AC) || + (It1 != ReducedValsToOps.end() && + any_of(It1->getSecond(), [&](Instruction *I) { + return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res; + }))) { std::swap(VectorizedTree, Res); + } else { + VectorizedTree = Builder.CreateFreeze(VectorizedTree); + } } return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", @@ -19722,9 +19853,6 @@ public: // Initialize the final value in the reduction. return Res; }; - bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) { - return isBoolLogicOp(cast<Instruction>(V)); - }); SmallDenseSet<Value *> IgnoreList(ReductionOps.size() * ReductionOps.front().size()); for (ReductionOpsType &RdxOps : ReductionOps) @@ -19801,7 +19929,7 @@ public: TrackedToOrig.try_emplace(RdxVal, RV); } SmallVector<int> Mask; - if (isFixedVectorShuffle(CommonCandidates, Mask)) { + if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) { ++I; Candidates.swap(CommonCandidates); ShuffledExtracts = true; @@ -20116,7 +20244,7 @@ public: // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. 
- if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot)) + if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); // Emit code to correctly handle reused reduced values, if required. @@ -20223,13 +20351,13 @@ public: bool InitStep) { if (!AnyBoolLogicOp) return; - if (isBoolLogicOp(RedOp1) && - ((!InitStep && LHS == VectorizedTree) || - getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) + if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || + isGuaranteedNotToBePoison(LHS, AC))) return; if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS))) { + isGuaranteedNotToBePoison(RHS, AC))) { std::swap(LHS, RHS); return; } @@ -20515,6 +20643,8 @@ private: case RecurKind::FMulAdd: case RecurKind::IAnyOf: case RecurKind::FAnyOf: + case RecurKind::IFindLastIV: + case RecurKind::FFindLastIV: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -20612,6 +20742,8 @@ private: case RecurKind::FMulAdd: case RecurKind::IAnyOf: case RecurKind::FAnyOf: + case RecurKind::IFindLastIV: + case RecurKind::FFindLastIV: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } @@ -20873,7 +21005,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( HorizontalReduction HorRdx; if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -20979,8 +21111,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, 
BuildVectorInsts, R) || - (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) && - isFixedVectorShuffle(BuildVectorOpds, Mask))) + (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) && + isFixedVectorShuffle(BuildVectorOpds, Mask, AC))) return false; if (MaxVFOnly && BuildVectorInsts.size() == 2) { @@ -21198,8 +21330,11 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, if (R.isDeleted(I)) continue; for (Value *Op : I->operands()) - if (auto *RootOp = dyn_cast<Instruction>(Op)) + if (auto *RootOp = dyn_cast<Instruction>(Op)) { Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R); + if (R.isDeleted(I)) + break; + } } // Try to vectorize operands as vector bundles. for (CmpInst *I : CmpInsts) { @@ -21735,9 +21870,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getType()->getScalarSizeInBits()) return false; // UndefValues are compatible with all other values. - if (isa<UndefValue>(V->getValueOperand()) || - isa<UndefValue>(V2->getValueOperand())) - return false; if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand())) if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = @@ -21751,14 +21883,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) - return false; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(V->getValueOperand()) && - isa<Constant>(V2->getValueOperand())) - return false; return V->getValueOperand()->getValueID() < V2->getValueOperand()->getValueID(); }; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 4b0e12c28f07..ba62c45a4e70 100644 --- 
a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -325,6 +325,113 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) { setDefUseUnscheduledSuccs(NewInterval); } +MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N, + bool IncludingN) const { + auto *I = N->getInstruction(); + for (auto *PrevI = IncludingN ? I : I->getPrevNode(); PrevI != nullptr; + PrevI = PrevI->getPrevNode()) { + auto *PrevN = getNodeOrNull(PrevI); + if (PrevN == nullptr) + return nullptr; + if (auto *PrevMemN = dyn_cast<MemDGNode>(PrevN)) + return PrevMemN; + } + return nullptr; +} + +MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, + bool IncludingN) const { + auto *I = N->getInstruction(); + for (auto *NextI = IncludingN ? I : I->getNextNode(); NextI != nullptr; + NextI = NextI->getNextNode()) { + auto *NextN = getNodeOrNull(NextI); + if (NextN == nullptr) + return nullptr; + if (auto *NextMemN = dyn_cast<MemDGNode>(NextN)) + return NextMemN; + } + return nullptr; +} + +void DependencyGraph::notifyCreateInstr(Instruction *I) { + auto *MemN = dyn_cast<MemDGNode>(getOrCreateNode(I)); + // TODO: Update the dependencies for the new node. + + // Update the MemDGNode chain if this is a memory node. + if (MemN != nullptr) { + if (auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false)) { + PrevMemN->NextMemN = MemN; + MemN->PrevMemN = PrevMemN; + } + if (auto *NextMemN = getMemDGNodeAfter(MemN, /*IncludingN=*/false)) { + NextMemN->PrevMemN = MemN; + MemN->NextMemN = NextMemN; + } + } +} + +void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { + // Early return if `I` doesn't actually move. + BasicBlock *BB = To.getNodeParent(); + if (To != BB->end() && &*To == I->getNextNode()) + return; + + // Maintain the DAGInterval. + DAGInterval.notifyMoveInstr(I, To); + + // TODO: Perhaps check if this is legal by checking the dependencies? 
+ + // Update the MemDGNode chain to reflect the instr movement if necessary. + DGNode *N = getNodeOrNull(I); + if (N == nullptr) + return; + MemDGNode *MemN = dyn_cast<MemDGNode>(N); + if (MemN == nullptr) + return; + // First detach it from the existing chain. + MemN->detachFromChain(); + // Now insert it back into the chain at the new location. + if (To != BB->end()) { + DGNode *ToN = getNodeOrNull(&*To); + if (ToN != nullptr) { + MemDGNode *PrevMemN = getMemDGNodeBefore(ToN, /*IncludingN=*/false); + MemDGNode *NextMemN = getMemDGNodeAfter(ToN, /*IncludingN=*/true); + MemN->PrevMemN = PrevMemN; + if (PrevMemN != nullptr) + PrevMemN->NextMemN = MemN; + MemN->NextMemN = NextMemN; + if (NextMemN != nullptr) + NextMemN->PrevMemN = MemN; + } + } else { + // MemN becomes the last instruction in the BB. + auto *TermN = getNodeOrNull(BB->getTerminator()); + if (TermN != nullptr) { + MemDGNode *PrevMemN = getMemDGNodeBefore(TermN, /*IncludingN=*/false); + PrevMemN->NextMemN = MemN; + MemN->PrevMemN = PrevMemN; + } else { + // The terminator is outside the DAG interval so do nothing. + } + } +} + +void DependencyGraph::notifyEraseInstr(Instruction *I) { + // Update the MemDGNode chain if this is a memory node. + if (auto *MemN = dyn_cast_or_null<MemDGNode>(getNodeOrNull(I))) { + auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false); + auto *NextMemN = getMemDGNodeAfter(MemN, /*IncludingN=*/false); + if (PrevMemN != nullptr) + PrevMemN->NextMemN = NextMemN; + if (NextMemN != nullptr) + NextMemN->PrevMemN = PrevMemN; + } + + InstrToNodeMap.erase(I); + + // TODO: Update the dependencies. 
+} + Interval<Instruction> DependencyGraph::extend(ArrayRef<Instruction *> Instrs) { if (Instrs.empty()) return {}; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index b801d1863e25..6d02efc05614 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -170,9 +170,7 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() { } void VPBlockBase::setPlan(VPlan *ParentPlan) { - assert( - (ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) && - "Can only set plan on its entry or preheader block."); + assert(ParentPlan->getEntry() == this && "Can only set plan on its entry."); Plan = ParentPlan; } @@ -207,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry))) - delete Block; -} - VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -222,9 +215,11 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { VPTransformState::VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan) + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy) : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {} + CurrentParentLoop(CurrentParentLoop), LVer(nullptr), + TypeAnalysis(CanonicalIVTy) {} Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -309,9 +304,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { if (!hasScalarValue(Def, LastLane)) { // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and // 
VPExpandSCEVRecipes can also be uniform. - assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || - isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || - isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && + assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, + VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -360,7 +354,7 @@ void VPTransformState::addNewMetadata(Instruction *To, const Instruction *Orig) { // If the loop was versioned with memchecks, add the corresponding no-alias // metadata. - if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) + if (LVer && isa<LoadInst, StoreInst>(Orig)) LVer->annotateInstWithNoAlias(To, Orig); } @@ -476,6 +470,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) { connectToPredecessors(State->CFG); } +VPIRBasicBlock *VPIRBasicBlock::clone() { + auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB); + for (VPRecipeBase &R : Recipes) + NewBlock->appendRecipe(R.clone()); + return NewBlock; +} + void VPBasicBlock::execute(VPTransformState *State) { bool Replica = bool(State->Lane); BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. @@ -502,8 +503,8 @@ void VPBasicBlock::execute(VPTransformState *State) { UnreachableInst *Terminator = State->Builder.CreateUnreachable(); // Register NewBB in its loop. In innermost loops its the same for all // BB's. 
- if (State->CurrentVectorLoop) - State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); + if (State->CurrentParentLoop) + State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); State->CFG.PrevBB = NewBB; @@ -515,14 +516,11 @@ void VPBasicBlock::execute(VPTransformState *State) { executeRecipes(State, NewBB); } -void VPBasicBlock::dropAllReferences(VPValue *NewValue) { - for (VPRecipeBase &R : Recipes) { - for (auto *Def : R.definedValues()) - Def->replaceAllUsesWith(NewValue); - - for (unsigned I = 0, E = R.getNumOperands(); I != E; I++) - R.setOperand(I, NewValue); - } +VPBasicBlock *VPBasicBlock::clone() { + auto *NewBlock = getPlan()->createVPBasicBlock(getName()); + for (VPRecipeBase &R : *this) + NewBlock->appendRecipe(R.clone()); + return NewBlock; } void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { @@ -543,7 +541,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { SmallVector<VPBlockBase *, 2> Succs(successors()); // Create new empty block after the block to split. - auto *SplitBlock = new VPBasicBlock(getName() + ".split"); + auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split"); VPBlockUtils::insertBlockAfter(SplitBlock, this); // Finally, move the recipes starting at SplitAt to new block. 
@@ -703,37 +701,30 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) { VPRegionBlock *VPRegionBlock::clone() { const auto &[NewEntry, NewExiting] = cloneFrom(getEntry()); - auto *NewRegion = - new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator()); + auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting, + getName(), isReplicator()); for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry)) Block->setParent(NewRegion); return NewRegion; } -void VPRegionBlock::dropAllReferences(VPValue *NewValue) { - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) - // Drop all references in VPBasicBlocks and replace all uses with - // DummyValue. - Block->dropAllReferences(NewValue); -} - void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(Entry); if (!isReplicator()) { // Create and register the new vector loop. - Loop *PrevLoop = State->CurrentVectorLoop; - State->CurrentVectorLoop = State->LI->AllocateLoop(); + Loop *PrevLoop = State->CurrentParentLoop; + State->CurrentParentLoop = State->LI->AllocateLoop(); BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()]; Loop *ParentLoop = State->LI->getLoopFor(VectorPH); // Insert the new loop into the loop nest and register the new basic blocks // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) - ParentLoop->addChildLoop(State->CurrentVectorLoop); + ParentLoop->addChildLoop(State->CurrentParentLoop); else - State->LI->addTopLevelLoop(State->CurrentVectorLoop); + State->LI->addTopLevelLoop(State->CurrentParentLoop); // Visit the VPBlocks connected to "this", starting from it. 
for (VPBlockBase *Block : RPOT) { @@ -741,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) { Block->execute(State); } - State->CurrentVectorLoop = PrevLoop; + State->CurrentParentLoop = PrevLoop; return; } @@ -823,16 +814,27 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif -VPlan::~VPlan() { - if (Entry) { - VPValue DummyValue; - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) - Block->dropAllReferences(&DummyValue); - - VPBlockBase::deleteCFG(Entry); +VPlan::VPlan(Loop *L) { + setEntry(createVPIRBasicBlock(L->getLoopPreheader())); + ScalarHeader = createVPIRBasicBlock(L->getHeader()); +} - Preheader->dropAllReferences(&DummyValue); - delete Preheader; +VPlan::~VPlan() { + VPValue DummyValue; + + for (auto *VPB : CreatedBlocks) { + if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) { + // Replace all operands of recipes and all VPValues defined in VPBB with + // DummyValue so the block can be deleted. + for (VPRecipeBase &R : *VPBB) { + for (auto *Def : R.definedValues()) + Def->replaceAllUsesWith(&DummyValue); + + for (unsigned I = 0, E = R.getNumOperands(); I != E; I++) + R.setOperand(I, &DummyValue); + } + } + delete VPB; } for (VPValue *VPV : VPLiveInsToFree) delete VPV; @@ -840,34 +842,27 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) { - auto *VPIRBB = new VPIRBasicBlock(IRBB); - for (Instruction &I : - make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) - VPIRBB->appendRecipe(new VPIRInstruction(I)); - return VPIRBB; -} - VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) { - VPIRBasicBlock *Entry = - VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader()); - VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); - VPIRBasicBlock *ScalarHeader = - VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader()); - auto Plan = 
std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader); + auto Plan = std::make_unique<VPlan>(TheLoop); + VPBlockBase *ScalarHeader = Plan->getScalarHeader(); + + // Connect entry only to vector preheader initially. Entry will also be + // connected to the scalar preheader later, during skeleton creation when + // runtime guards are added as needed. Note that when executing the VPlan for + // an epilogue vector loop, the original entry block here will be replaced by + // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after + // generating code for the main vector loop. + VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph"); + VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader); // Create SCEV and VPValue for the trip count. - - // Currently only loops with countable exits are vectorized, but calling - // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with - // uncountable exits whilst also ensuring the symbolic maximum and known - // back-edge taken count remain identical for loops with countable exits. + // We use the symbolic max backedge-taken-count, which works also when + // vectorizing loops with uncountable early exits. const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && - BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) && + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, @@ -877,17 +872,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // Create VPRegionBlock, with empty header and latch blocks, to be filled // during processing later. 
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); - VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body"); + VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop", - false /*isReplicator*/); + auto *TopRegion = Plan->createVPRegionBlock( + HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/); VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); - VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader); if (!RequiresScalarEpilogueCheck) { VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -902,8 +897,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // 2) If we require a scalar epilogue, there is no conditional branch as // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. - BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); + auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. 
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -927,7 +922,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, } void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, - Value *CanonicalIVStartValue, VPTransformState &State) { Type *TCTy = TripCountV->getType(); // Check if the backedge taken count is needed, and if so build it. @@ -953,41 +947,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, } else { VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF)); } - - // When vectorizing the epilogue loop, the canonical induction start value - // needs to be changed from zero to the value after the main vector loop. - // FIXME: Improve modeling for canonical IV start values in the epilogue loop. - if (CanonicalIVStartValue) { - VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue); - auto *IV = getCanonicalIV(); - assert(all_of(IV->users(), - [](const VPUser *U) { - return isa<VPScalarIVStepsRecipe>(U) || - isa<VPScalarCastRecipe>(U) || - isa<VPDerivedIVRecipe>(U) || - cast<VPInstruction>(U)->getOpcode() == - Instruction::Add; - }) && - "the canonical IV should only be used by its increment or " - "ScalarIVSteps when resetting the start value"); - IV->setOperand(0, VPV); - } -} - -/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p -/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must -/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All -/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB); - for (auto &R : make_early_inc_range(*VPBB)) { - assert(!R.isPhi() && "Tried to move phi recipe to end of block"); - R.moveBefore(*IRVPBB, IRVPBB->end()); - } - - VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); - - delete VPBB; } /// Generate the code inside the preheader and body of the vectorized loop. @@ -997,27 +956,23 @@ void VPlan::execute(VPTransformState *State) { // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); - BasicBlock *VectorPreHeader = State->CFG.PrevBB; - State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); // Disconnect VectorPreHeader from ExitBB in both the CFG and DT. + BasicBlock *VectorPreHeader = State->CFG.PrevBB; cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr); State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}}); - // Replace regular VPBB's for the middle and scalar preheader blocks with - // VPIRBasicBlocks wrapping their IR blocks. The IR blocks are created during - // skeleton creation, so we can only create the VPIRBasicBlocks now during - // VPlan execution rather than earlier during VPlan construction. - BasicBlock *MiddleBB = State->CFG.ExitBB; - VPBasicBlock *MiddleVPBB = getMiddleBlock(); - BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); - replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); - replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF + << ", UF=" << getUF() << '\n'); + setName("Final VPlan"); + LLVM_DEBUG(dump()); // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. The branch will be recreated during VPlan // execution. 
+ BasicBlock *MiddleBB = State->CFG.ExitBB; + BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); auto *BrInst = new UnreachableInst(MiddleBB->getContext()); BrInst->insertBefore(MiddleBB->getTerminator()); MiddleBB->getTerminator()->eraseFromParent(); @@ -1028,8 +983,11 @@ void VPlan::execute(VPTransformState *State) { State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}}); - // Generate code in the loop pre-header and body. - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) + ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( + Entry); + // Generate code for the VPlan, in parts of the vector skeleton, loop body and + // successor blocks including the middle, exit and scalar preheader blocks. + for (VPBlockBase *Block : RPOT) Block->execute(State); VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); @@ -1043,8 +1001,7 @@ void VPlan::execute(VPTransformState *State) { if (isa<VPWidenPHIRecipe>(&R)) continue; - if (isa<VPWidenPointerInductionRecipe>(&R) || - isa<VPWidenIntOrFpInductionRecipe>(&R)) { + if (isa<VPWidenInductionRecipe>(&R)) { PHINode *Phi = nullptr; if (isa<VPWidenIntOrFpInductionRecipe>(&R)) { Phi = cast<PHINode>(State->get(R.getVPSingleValue())); @@ -1079,9 +1036,6 @@ void VPlan::execute(VPTransformState *State) { } State->CFG.DTU.flush(); - assert(State->CFG.DTU.getDomTree().verify( - DominatorTree::VerificationLevel::Fast) && - "DT not preserved correctly"); } InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { @@ -1090,6 +1044,21 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { return getVectorLoopRegion()->cost(VF, Ctx); } +VPRegionBlock *VPlan::getVectorLoopRegion() { + // TODO: Cache if possible. 
+ for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast<VPRegionBlock>(B)) + return R; + return nullptr; +} + +const VPRegionBlock *VPlan::getVectorLoopRegion() const { + for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast<VPRegionBlock>(B)) + return R; + return nullptr; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); @@ -1134,12 +1103,9 @@ void VPlan::print(raw_ostream &O) const { printLiveIns(O); - if (!getPreheader()->empty()) { - O << "\n"; - getPreheader()->print(O, "", SlotTracker); - } - - for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) { + ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<const VPBlockBase *>> + RPOT(getEntry()); + for (const VPBlockBase *Block : RPOT) { O << '\n'; Block->print(O, "", SlotTracker); } @@ -1219,8 +1185,8 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, } VPlan *VPlan::duplicate() { + unsigned NumBlocksBeforeCloning = CreatedBlocks.size(); // Clone blocks. - VPBasicBlock *NewPreheader = Preheader->clone(); const auto &[NewEntry, __] = cloneFrom(Entry); BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); @@ -1230,8 +1196,7 @@ VPlan *VPlan::duplicate() { return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; })); // Create VPlan, clone live-ins and remap operands in the cloned blocks. - auto *NewPlan = - new VPlan(NewPreheader, cast<VPBasicBlock>(NewEntry), NewScalarHeader); + auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader); DenseMap<VPValue *, VPValue *> Old2NewVPValues; for (VPValue *OldLiveIn : VPLiveInsToFree) { Old2NewVPValues[OldLiveIn] = @@ -1251,7 +1216,6 @@ VPlan *VPlan::duplicate() { // else NewTripCount will be created and inserted into Old2NewVPValues when // TripCount is cloned. In any case NewPlan->TripCount is updated below. 
- remapOperands(Preheader, NewPreheader, Old2NewVPValues); remapOperands(Entry, NewEntry, Old2NewVPValues); // Initialize remaining fields of cloned VPlan. @@ -1262,9 +1226,32 @@ VPlan *VPlan::duplicate() { assert(Old2NewVPValues.contains(TripCount) && "TripCount must have been added to Old2NewVPValues"); NewPlan->TripCount = Old2NewVPValues[TripCount]; + + // Transfer all cloned blocks (the second half of all current blocks) from + // current to new VPlan. + unsigned NumBlocksAfterCloning = CreatedBlocks.size(); + for (unsigned I : + seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning)) + NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]); + CreatedBlocks.truncate(NumBlocksBeforeCloning); + return NewPlan; } +VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = new VPIRBasicBlock(IRBB); + CreatedBlocks.push_back(VPIRBB); + return VPIRBB; +} + +VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB); + for (Instruction &I : + make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) + VPIRBB->appendRecipe(new VPIRInstruction(I)); + return VPIRBB; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -1303,8 +1290,6 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - dumpBlock(Plan.getPreheader()); - for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry())) dumpBlock(Block); @@ -1565,7 +1550,6 @@ void VPSlotTracker::assignNames(const VPlan &Plan) { assignName(Plan.BackedgeTakenCount); for (VPValue *LI : Plan.VPLiveInsToFree) assignName(LI); - assignNames(Plan.getPreheader()); ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>> RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 
e1d828f038f9..88f3f672d3aa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -236,7 +236,8 @@ public: struct VPTransformState { VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan); + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy); /// Target Transform Info. const TargetTransformInfo *TTI; @@ -373,8 +374,8 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; - /// The loop object for the current parent region, or nullptr. - Loop *CurrentVectorLoop = nullptr; + /// The parent loop object for the current scope, or nullptr. + Loop *CurrentParentLoop = nullptr; /// LoopVersioning. It's only set up (non-null) if memchecks were /// used. @@ -621,6 +622,14 @@ public: /// Remove all the successors of this block. void clearSuccessors() { Successors.clear(); } + /// Swap successors of the block. The block must have exactly 2 successors. + // TODO: This should be part of introducing conditional branch recipes rather + // than being independent. + void swapSuccessors() { + assert(Successors.size() == 2 && "must have 2 successors to swap"); + std::swap(Successors[0], Successors[1]); + } + /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(VPTransformState *State) = 0; @@ -628,9 +637,6 @@ public: /// Return the cost of the block. virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; - /// Delete all blocks reachable from a given VPBlockBase, inclusive. - static void deleteCFG(VPBlockBase *Entry); - /// Return true if it is legal to hoist instructions into this block. 
bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -638,10 +644,6 @@ public: return true; } - /// Replace all operands of VPUsers in the block with \p NewValue and also - /// replaces all uses of VPValues defined in the block with NewValue. - virtual void dropAllReferences(VPValue *NewValue) = 0; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printAsOperand(raw_ostream &OS, bool PrintType = false) const { OS << getName(); @@ -944,11 +946,6 @@ public: DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} }; - struct GEPFlagsTy { - char IsInBounds : 1; - GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {} - }; - private: struct ExactFlagsTy { char IsExact : 1; @@ -975,7 +972,7 @@ private: WrapFlagsTy WrapFlags; DisjointFlagsTy DisjointFlags; ExactFlagsTy ExactFlags; - GEPFlagsTy GEPFlags; + GEPNoWrapFlags GEPFlags; NonNegFlagsTy NonNegFlags; FastMathFlagsTy FMFs; unsigned AllFlags; @@ -1012,7 +1009,7 @@ public: ExactFlags.IsExact = Op->isExact(); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { OpType = OperationType::GEPOp; - GEPFlags.IsInBounds = GEP->isInBounds(); + GEPFlags = GEP->getNoWrapFlags(); } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) { OpType = OperationType::NonNegOp; NonNegFlags.NonNeg = PNNI->hasNonNeg(); @@ -1052,7 +1049,7 @@ public: protected: template <typename IterT> VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - GEPFlagsTy GEPFlags, DebugLoc DL = {}) + GEPNoWrapFlags GEPFlags, DebugLoc DL = {}) : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} @@ -1089,7 +1086,7 @@ public: ExactFlags.IsExact = false; break; case OperationType::GEPOp: - GEPFlags.IsInBounds = false; + GEPFlags = GEPNoWrapFlags::none(); break; case OperationType::FPMathOp: FMFs.NoNaNs = false; @@ -1118,10 +1115,7 @@ public: I->setIsExact(ExactFlags.IsExact); break; case OperationType::GEPOp: - // TODO(gep_nowrap): Track the full 
GEPNoWrapFlags in VPlan. - cast<GetElementPtrInst>(I)->setNoWrapFlags( - GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none()); + cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags); break; case OperationType::FPMathOp: I->setHasAllowReassoc(FMFs.AllowReassoc); @@ -1147,11 +1141,7 @@ public: return CmpPredicate; } - bool isInBounds() const { - assert(OpType == OperationType::GEPOp && - "recipe doesn't have inbounds flag"); - return GEPFlags.IsInBounds; - } + GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; } /// Returns true if the recipe has fast-math flags. bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; } @@ -1232,6 +1222,9 @@ public: // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + // Returns a scalar boolean value, which is true if any lane of its single + // operand is true. + AnyOf, }; private: @@ -1295,7 +1288,7 @@ public: assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint"); } - VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags, + VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags, DebugLoc DL = {}, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL), @@ -1336,14 +1329,6 @@ public: LLVM_DUMP_METHOD void dump() const; #endif - /// Return true if this instruction may modify memory. - bool mayWriteToMemory() const { - // TODO: we can use attributes of the called function to rule out memory - // modifications. - return Opcode == Instruction::Store || Opcode == Instruction::Call || - Opcode == Instruction::Invoke || Opcode == SLPStore; - } - bool hasResult() const { // CallInst may or may not have a result, depending on the called function. // Conservatively return calls have results for now. 
@@ -1662,7 +1647,7 @@ public: VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID); @@ -1697,6 +1682,9 @@ public: InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; + /// Return the ID of the intrinsic. + Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; } + /// Return the scalar return type of the intrinsic. Type *getResultType() const { return ResultTy; } @@ -1911,10 +1899,9 @@ class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags, public: VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy, - bool IsInBounds, DebugLoc DL) + GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC, - ArrayRef<VPValue *>({Ptr, VF}), - GEPFlagsTy(IsInBounds), DL), + ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL), IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC) @@ -1946,8 +1933,9 @@ public: } VPReverseVectorPointerRecipe *clone() override { - return new VPReverseVectorPointerRecipe( - getOperand(0), getVFValue(), IndexedTy, isInBounds(), getDebugLoc()); + return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(), + IndexedTy, getGEPNoWrapFlags(), + getDebugLoc()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1963,10 +1951,10 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, Type *IndexedTy; public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsInBounds, + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr), - 
GEPFlagsTy(IsInBounds), DL), + GEPFlags, DL), IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1988,8 +1976,8 @@ public: } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, isInBounds(), - getDebugLoc()); + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + getGEPNoWrapFlags(), getDebugLoc()); } /// Return the cost of this VPHeaderPHIRecipe. @@ -2088,28 +2076,72 @@ public: } }; +/// Base class for widened induction (VPWidenIntOrFpInductionRecipe and +/// VPWidenPointerInductionRecipe), providing shared functionality, including +/// retrieving the step value, induction descriptor and original phi node. +class VPWidenInductionRecipe : public VPHeaderPHIRecipe { + const InductionDescriptor &IndDesc; + +public: + VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start, + VPValue *Step, const InductionDescriptor &IndDesc, + DebugLoc DL) + : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) { + addOperand(Step); + } + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC || + R->getVPDefID() == VPDef::VPWidenPointerInductionSC; + } + + virtual void execute(VPTransformState &State) override = 0; + + /// Returns the step value of the induction. + VPValue *getStepValue() { return getOperand(1); } + const VPValue *getStepValue() const { return getOperand(1); } + + PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } + + VPValue *getBackedgeValue() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. 
+ llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } + + VPRecipeBase &getBackedgeRecipe() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. + llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector values. -class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { - PHINode *IV; +class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { TruncInst *Trunc; - const InductionDescriptor &IndDesc; public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - VPValue *VF, const InductionDescriptor &IndDesc) - : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), - Trunc(nullptr), IndDesc(IndDesc) { - addOperand(Step); + VPValue *VF, const InductionDescriptor &IndDesc, + DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, + Step, IndDesc, DL), + Trunc(nullptr) { addOperand(VF); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, VPValue *VF, const InductionDescriptor &IndDesc, - TruncInst *Trunc) - : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), - IV(IV), Trunc(Trunc), IndDesc(IndDesc) { - addOperand(Step); + TruncInst *Trunc, DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, + Step, IndDesc, DL), + Trunc(Trunc) { addOperand(VF); } @@ -2117,7 +2149,8 @@ public: VPWidenIntOrFpInductionRecipe *clone() override { return new VPWidenIntOrFpInductionRecipe( - IV, getStartValue(), getStepValue(), getVFValue(), IndDesc, Trunc); + getPHINode(), getStartValue(), getStepValue(), getVFValue(), + getInductionDescriptor(), Trunc, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) @@ -2132,24 +2165,6 @@ public: VPSlotTracker &SlotTracker) const override; #endif 
- VPValue *getBackedgeValue() override { - // TODO: All operands of base recipe must exist and be at same index in - // derived recipe. - llvm_unreachable( - "VPWidenIntOrFpInductionRecipe generates its own backedge value"); - } - - VPRecipeBase &getBackedgeRecipe() override { - // TODO: All operands of base recipe must exist and be at same index in - // derived recipe. - llvm_unreachable( - "VPWidenIntOrFpInductionRecipe generates its own backedge value"); - } - - /// Returns the step value of the induction. - VPValue *getStepValue() { return getOperand(1); } - const VPValue *getStepValue() const { return getOperand(1); } - VPValue *getVFValue() { return getOperand(2); } const VPValue *getVFValue() const { return getOperand(2); } @@ -2164,11 +2179,6 @@ public: TruncInst *getTruncInst() { return Trunc; } const TruncInst *getTruncInst() const { return Trunc; } - PHINode *getPHINode() { return IV; } - - /// Returns the induction descriptor for the recipe. - const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } - /// Returns true if the induction is canonical, i.e. starting at 0 and /// incremented by UF * VF (= the original IV is incremented by 1) and has the /// same type as the canonical induction. @@ -2176,7 +2186,7 @@ public: /// Returns the scalar type of the induction. Type *getScalarType() const { - return Trunc ? Trunc->getType() : IV->getType(); + return Trunc ? Trunc->getType() : getPHINode()->getType(); } /// Returns the VPValue representing the value of this induction at @@ -2187,10 +2197,8 @@ public: } }; -class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe, +class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe, public VPUnrollPartAccessor<3> { - const InductionDescriptor &IndDesc; - bool IsScalarAfterVectorization; public: @@ -2198,20 +2206,17 @@ public: /// Start. 
VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool IsScalarAfterVectorization) - : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi), - IndDesc(IndDesc), - IsScalarAfterVectorization(IsScalarAfterVectorization) { - addOperand(Start); - addOperand(Step); - } + bool IsScalarAfterVectorization, DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start, + Step, IndDesc, DL), + IsScalarAfterVectorization(IsScalarAfterVectorization) {} ~VPWidenPointerInductionRecipe() override = default; VPWidenPointerInductionRecipe *clone() override { return new VPWidenPointerInductionRecipe( cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1), - IndDesc, IsScalarAfterVectorization); + getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC) @@ -2222,9 +2227,6 @@ public: /// Returns true if only scalar values will be generated. bool onlyScalarsGenerated(bool IsScalable); - /// Returns the induction descriptor for the recipe. - const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } - /// Returns the VPValue representing the value of this induction at /// the first unrolled part, if it exists. Returns itself if unrolling did not /// take place. 
@@ -2589,8 +2591,9 @@ class VPReductionRecipe : public VPSingleDefRecipe { protected: VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R, Instruction *I, ArrayRef<VPValue *> Operands, - VPValue *CondOp, bool IsOrdered) - : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) { + VPValue *CondOp, bool IsOrdered, DebugLoc DL) + : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R), + IsOrdered(IsOrdered) { if (CondOp) { IsConditional = true; addOperand(CondOp); @@ -2600,16 +2603,17 @@ protected: public: VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered) + bool IsOrdered, DebugLoc DL = {}) : VPReductionRecipe(VPDef::VPReductionSC, R, I, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, - IsOrdered) {} + IsOrdered, DL) {} ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(), - getVecOp(), getCondOp(), IsOrdered); + getVecOp(), getCondOp(), IsOrdered, + getDebugLoc()); } static inline bool classof(const VPRecipeBase *R) { @@ -2664,7 +2668,7 @@ public: VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(), cast_or_null<Instruction>(R.getUnderlyingValue()), ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, - R.isOrdered()) {} + R.isOrdered(), R.getDebugLoc()) {} ~VPReductionEVLRecipe() override = default; @@ -2834,12 +2838,12 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. 
- VPPredInstPHIRecipe(VPValue *PredV) - : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {} + VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL) + : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {} ~VPPredInstPHIRecipe() override = default; VPPredInstPHIRecipe *clone() override { - return new VPPredInstPHIRecipe(getOperand(0)); + return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC) @@ -3203,11 +3207,6 @@ public: return true; } - /// Check if the induction described by \p Kind, /p Start and \p Step is - /// canonical, i.e. has the same start and step (of 1) as the canonical IV. - bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start, - VPValue *Step) const; - /// Return the cost of this VPCanonicalIVPHIRecipe. InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override { @@ -3551,8 +3550,6 @@ public: return make_range(begin(), getFirstNonPhi()); } - void dropAllReferences(VPValue *NewValue) override; - /// Split current block at \p SplitAt by inserting a new block between the /// current block and its successors and moving all recipes starting at /// SplitAt to the new block. Returns the new block. @@ -3582,12 +3579,7 @@ public: /// Clone the current block and it's recipes, without updating the operands of /// the cloned recipes. - VPBasicBlock *clone() override { - auto *NewBlock = new VPBasicBlock(getName()); - for (VPRecipeBase &R : *this) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPBasicBlock *clone() override; protected: /// Execute the recipes in the IR basic block \p BB. @@ -3623,20 +3615,11 @@ public: return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; } - /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all - /// instructions in \p IRBB, except its terminator which is managed in VPlan. 
- static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB); - /// The method which generates the output IR instructions that correspond to /// this VPBasicBlock, thereby "executing" the VPlan. void execute(VPTransformState *State) override; - VPIRBasicBlock *clone() override { - auto *NewBlock = new VPIRBasicBlock(IRBB); - for (VPRecipeBase &R : Recipes) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPIRBasicBlock *clone() override; BasicBlock *getIRBasicBlock() const { return IRBB; } }; @@ -3675,13 +3658,7 @@ public: : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} - ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); - deleteCFG(Entry); - } - } + ~VPRegionBlock() override {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -3729,8 +3706,6 @@ public: // Return the cost of this region. InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; - void dropAllReferences(VPValue *NewValue) override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using @@ -3757,14 +3732,12 @@ class VPlan { friend class VPlanPrinter; friend class VPSlotTracker; - /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the - /// preheader of the vector loop. - VPBasicBlock *Entry; - /// VPBasicBlock corresponding to the original preheader. Used to place /// VPExpandSCEV recipes for expressions used during skeleton creation and the /// rest of VPlan execution. - VPBasicBlock *Preheader; + /// When this VPlan is used for the epilogue vector loop, the entry will be + /// replaced by a new entry block created during skeleton creation. + VPBasicBlock *Entry; /// VPIRBasicBlock wrapping the header of the original scalar loop. 
VPIRBasicBlock *ScalarHeader; @@ -3809,46 +3782,50 @@ class VPlan { /// been modeled in VPlan directly. DenseMap<const SCEV *, VPValue *> SCEVToExpansion; -public: - /// Construct a VPlan with original preheader \p Preheader, trip count \p TC, - /// \p Entry to the plan and with \p ScalarHeader wrapping the original header - /// of the scalar loop. At the moment, \p Preheader and \p Entry need to be - /// disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. - VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry, - VPIRBasicBlock *ScalarHeader) - : VPlan(Preheader, Entry, ScalarHeader) { - TripCount = TC; - } + /// Blocks allocated and owned by the VPlan. They will be deleted once the + /// VPlan is destroyed. + SmallVector<VPBlockBase *> CreatedBlocks; - /// Construct a VPlan with original preheader \p Preheader, \p Entry to - /// the plan and with \p ScalarHeader wrapping the original header of the - /// scalar loop. At the moment, \p Preheader and \p Entry need to be - /// disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. - VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry, - VPIRBasicBlock *ScalarHeader) - : Entry(Entry), Preheader(Preheader), ScalarHeader(ScalarHeader) { + /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader + /// wrapping the original header of the scalar loop. + VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) + : Entry(Entry), ScalarHeader(ScalarHeader) { Entry->setPlan(this); - Preheader->setPlan(this); - assert(Preheader->getNumSuccessors() == 0 && - Preheader->getNumPredecessors() == 0 && - "preheader must be disconnected"); assert(ScalarHeader->getNumSuccessors() == 0 && "scalar header must be a leaf node"); } +public: + /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the + /// original preheader and scalar header of \p L, to be used as entry and + /// scalar header blocks of the new VPlan. 
+ VPlan(Loop *L); + + /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock + /// wrapping \p ScalarHeaderBB and a trip count of \p TC. + VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { + setEntry(createVPBasicBlock("preheader")); + ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB); + TripCount = TC; + } + ~VPlan(); + void setEntry(VPBasicBlock *VPBB) { + Entry = VPBB; + VPBB->setPlan(this); + } + /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping - /// original scalar pre-header ) which contains SCEV expansions that need - /// to happen before the CFG is modified; a VPBasicBlock for the vector - /// pre-header, followed by a region for the vector loop, followed by the - /// middle VPBasicBlock. If a check is needed to guard executing the scalar - /// epilogue loop, it will be added to the middle block, together with - /// VPBasicBlocks for the scalar preheader and exit blocks. - /// \p InductionTy is the type of the canonical induction and used for related - /// values, like the trip count expression. + /// original scalar pre-header) which contains SCEV expansions that need + /// to happen before the CFG is modified (when executing a VPlan for the + /// epilogue vector loop, the original entry needs to be replaced by a new + /// one); a VPBasicBlock for the vector pre-header, followed by a region for + /// the vector loop, followed by the middle VPBasicBlock. If a check is needed + /// to guard executing the scalar epilogue loop, it will be added to the + /// middle block, together with VPBasicBlocks for the scalar preheader and + /// exit blocks. \p InductionTy is the type of the canonical induction and + /// used for related values, like the trip count expression. static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, @@ -3856,7 +3833,7 @@ public: /// Prepare the plan for execution, setting up the required live-in values. 
void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State); + VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); @@ -3873,26 +3850,22 @@ public: } /// Returns the VPRegionBlock of the vector loop. - VPRegionBlock *getVectorLoopRegion() { - return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); - } - const VPRegionBlock *getVectorLoopRegion() const { - return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); - } + VPRegionBlock *getVectorLoopRegion(); + const VPRegionBlock *getVectorLoopRegion() const; /// Returns the 'middle' block of the plan, that is the block that selects /// whether to execute the scalar tail loop or the exit block from the loop /// latch. const VPBasicBlock *getMiddleBlock() const { - return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor()); + return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front()); } VPBasicBlock *getMiddleBlock() { - return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor()); + return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front()); } /// Return the VPBasicBlock for the preheader of the scalar loop. VPBasicBlock *getScalarPreheader() const { - return cast<VPBasicBlock>(ScalarHeader->getSinglePredecessor()); + return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor()); } /// Return the VPIRBasicBlock wrapping the header of the scalar loop. @@ -4027,13 +4000,52 @@ public: SCEVToExpansion[S] = V; } - /// \return The block corresponding to the original preheader. - VPBasicBlock *getPreheader() { return Preheader; } - const VPBasicBlock *getPreheader() const { return Preheader; } - /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. VPlan *duplicate(); + + /// Create a new VPBasicBlock with \p Name and containing \p Recipe if + /// present. 
The returned block is owned by the VPlan and deleted once the + /// VPlan is destroyed. + VPBasicBlock *createVPBasicBlock(const Twine &Name, + VPRecipeBase *Recipe = nullptr) { + auto *VPB = new VPBasicBlock(Name, Recipe); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p + /// IsReplicator is true, the region is a replicate region. The returned block + /// is owned by the VPlan and deleted once the VPlan is destroyed. + VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, + const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set + /// to nullptr. If \p IsReplicator is true, the region is a replicate region. + /// The returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. + VPRegionBlock *createVPRegionBlock(const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create + /// VPIRInstructions wrapping the instructions in t\p IRBB. The returned + /// block is owned by the VPlan and deleted once the VPlan is destroyed. + VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB); + + /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all + /// instructions in \p IRBB, except its terminator which is managed by the + /// successors of the block in VPlan. The returned block is owned by the VPlan + /// and deleted once the VPlan is destroyed. 
+ VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB); }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -4179,8 +4191,6 @@ public: "Can't connect two block with different parents"); assert((SuccIdx != -1u || From->getNumSuccessors() < 2) && "Blocks can't have more than two successors."); - assert((PredIdx != -1u || To->getNumPredecessors() < 2) && - "Blocks can't have more than two predecessors."); if (SuccIdx == -1u) From->appendSuccessor(To); else diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 969d07b229e4..35497a7431f7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -61,10 +61,16 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case Instruction::ICmp: case VPInstruction::ActiveLaneMask: return inferScalarType(R->getOperand(1)); + case VPInstruction::ComputeReductionResult: { + auto *PhiR = cast<VPReductionPHIRecipe>(R->getOperand(0)); + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + return OrigPhi->getType(); + } case VPInstruction::ExplicitVectorLength: return Type::getIntNTy(Ctx, 32); case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::Not: + case VPInstruction::ResumePhi: return SetResultTyFromOp(); case VPInstruction::ExtractFromEnd: { Type *BaseTy = inferScalarType(R->getOperand(0)); @@ -127,7 +133,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) && + assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) && "Store recipes should not define any values"); return cast<LoadInst>(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 6e633739fcc3..76ed578424df 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { // Create new VPBB. StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName(); LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); - VPBasicBlock *VPBB = new VPBasicBlock(Name); + VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name); BB2VPBB[BB] = VPBB; // Get or create a region for the loop containing BB. @@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { if (LoopOfBB == TheLoop) { RegionOfVPBB = Plan.getVectorLoopRegion(); } else { - RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/); + RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/); RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); } RegionOfVPBB->setEntry(VPBB); @@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() { BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB; VectorHeaderVPBB->clearSuccessors(); VectorLatchVPBB->clearPredecessors(); - if (TheLoop->getHeader() != TheLoop->getLoopLatch()) { + if (TheLoop->getHeader() != TheLoop->getLoopLatch()) BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB; - } else { + else TheRegion->setExiting(VectorHeaderVPBB); - delete VectorLatchVPBB; - } // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 9e8f9f3f4002..ad6e2ad90a96 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -32,11 +32,11 @@ class Loop; class LoopInfo; class VPRegionBlock; class VPlan; -class VPlanTestBase; +class VPlanTestIRBase; /// Main class to build the VPlan H-CFG for an incoming IR. 
class VPlanHCFGBuilder { - friend VPlanTestBase; + friend VPlanTestIRBase; private: // The outermost loop of the input loop nest considered for vectorization. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 18e5e2996c82..ec3c203a61b3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -78,6 +78,8 @@ template <unsigned BitWidth = 0> struct specific_intval { if (!VPV->isLiveIn()) return false; Value *V = VPV->getLiveInIRValue(); + if (!V) + return false; const auto *CI = dyn_cast<ConstantInt>(V); if (!CI && V->getType()->isVectorTy()) if (const auto *C = dyn_cast<Constant>(V)) @@ -136,7 +138,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> { // Check for recipes that do not have opcodes. if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value || std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value || - std::is_same<RecipeTy, VPWidenSelectRecipe>::value) + std::is_same<RecipeTy, VPWidenSelectRecipe>::value || + std::is_same<RecipeTy, VPDerivedIVRecipe>::value) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -382,6 +385,17 @@ inline VPScalarIVSteps_match<Op0_t, Op1_t> m_ScalarIVSteps(const Op0_t &Op0, const Op1_t &Op1) { return VPScalarIVSteps_match<Op0_t, Op1_t>(Op0, Op1); } + +template <typename Op0_t, typename Op1_t, typename Op2_t> +using VPDerivedIV_match = + Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0, false, VPDerivedIVRecipe>; + +template <typename Op0_t, typename Op1_t, typename Op2_t> +inline VPDerivedIV_match<Op0_t, Op1_t, Op2_t> +m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); +} + } // namespace VPlanPatternMatch } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ef5f6e22f822..77c08839dbfa 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -57,6 +57,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case Instruction::Or: case Instruction::ICmp: case Instruction::Select: + case VPInstruction::AnyOf: case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: @@ -361,6 +362,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: + case VPInstruction::AnyOf: return true; default: return false; @@ -565,6 +567,9 @@ Value *VPInstruction::generate(VPTransformState &State) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) + ReducedPartRdx = + createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart); else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } @@ -573,7 +578,8 @@ Value *VPInstruction::generate(VPTransformState &State) { // Create the reduction after the loop. Note that inloop reductions create // the target reduction in the loop using a Reduction recipe. if ((State.VF.isVector() || - RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { ReducedPartRdx = createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); @@ -615,8 +621,7 @@ Value *VPInstruction::generate(VPTransformState &State) { "can only generate first lane for PtrAdd"); Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *Addend = State.get(getOperand(1), VPLane(0)); - return isInBounds() ? 
Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name) - : Builder.CreatePtrAdd(Ptr, Addend, Name); + return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::ResumePhi: { Value *IncomingFromVPlanPred = @@ -624,18 +629,22 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *IncomingFromOtherPreds = State.get(getOperand(1), /* IsScalar */ true); auto *NewPhi = - Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name); + Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name); BasicBlock *VPlanPred = State.CFG - .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())]; + .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getPredecessors()[0])]; NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred); for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) { - assert(OtherPred != VPlanPred && - "VPlan predecessors should not be connected yet"); + if (OtherPred == VPlanPred) + continue; NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred); } return NewPhi; } + case VPInstruction::AnyOf: { + Value *A = State.get(getOperand(0)); + return Builder.CreateOrReduce(A); + } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -644,7 +653,8 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::AnyOf; } bool VPInstruction::isSingleScalar() const { @@ -707,6 +717,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return false; case Instruction::ICmp: case Instruction::Select: + case Instruction::Or: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. 
return vputils::onlyFirstLaneUsed(this); @@ -802,6 +813,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::AnyOf: + O << "any-of"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -819,12 +833,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, void VPIRInstruction::execute(VPTransformState &State) { assert((isa<PHINode>(&I) || getNumOperands() == 0) && "Only PHINodes can have extra operands"); - if (getNumOperands() == 1) { - VPValue *ExitValue = getOperand(0); + for (const auto &[Idx, Op] : enumerate(operands())) { + VPValue *ExitValue = Op; auto Lane = vputils::isUniformAfterVectorization(ExitValue) ? VPLane::getFirstLane() : VPLane::getLastLaneForVF(State.VF); - auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor()); + VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; + auto *PredVPBB = Pred->getExitingBasicBlock(); BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; // Set insertion point in PredBB in case an extract needs to be generated. // TODO: Model extracts explicitly. @@ -857,11 +872,13 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, O << Indent << "IR " << I; if (getNumOperands() != 0) { - assert(getNumOperands() == 1 && "can have at most 1 operand"); - O << " (extra operand: "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << " from "; - getParent()->getPredecessors()[0]->printAsOperand(O); + O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; + interleaveComma( + enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { + Op.value()->printAsOperand(O, SlotTracker); + O << " from "; + getParent()->getPredecessors()[Op.index()]->printAsOperand(O); + }); O << ")"; } } @@ -950,7 +967,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { // Some intrinsics have a scalar argument - don't replace it with a // vector. 
Value *Arg; - if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) + if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), + State.TTI)) Arg = State.get(I.value(), VPLane(0)); else Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); @@ -964,7 +982,8 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { Module *M = State.Builder.GetInsertBlock()->getModule(); Function *VectorF = Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); - assert(VectorF && "Can't retrieve vector intrinsic."); + assert(VectorF && + "Can't retrieve vector intrinsic or vector-predication intrinsics."); auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); SmallVector<OperandBundleDef, 1> OpBundles; @@ -1012,11 +1031,11 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); SmallVector<Type *> ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( - ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. FastMathFlags FMF = hasFastMathFlags() ? 
getFastMathFlags() : FastMathFlags(); @@ -1184,7 +1203,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; VPValue *Op0, *Op1; @@ -1254,8 +1273,12 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { getFastMathFlags().print(O); break; case OperationType::GEPOp: - if (GEPFlags.IsInBounds) + if (GEPFlags.isInBounds()) O << " inbounds"; + else if (GEPFlags.hasNoUnsignedSignedWrap()) + O << " nusw"; + if (GEPFlags.hasNoUnsignedWrap()) + O << " nuw"; break; case OperationType::NonNegOp: if (NonNegFlags.NonNeg) @@ -1361,7 +1384,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; switch (Opcode) { case Instruction::FNeg: { - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticInstrCost( Opcode, VectorTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, @@ -1399,7 +1422,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && getOperand(1)->isDefinedOutsideLoopRegions()) RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); SmallVector<const Value *, 4> Operands; @@ -1412,13 +1435,13 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } case Instruction::Freeze: { // This opcode is unknown. 
Assume that it is the same as 'mul'. - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); } case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); - Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), CostKind, {TTI::OK_AnyValue, TTI::OP_None}, @@ -1546,8 +1569,8 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, } auto *SrcTy = - cast<VectorType>(ToVectorTy(Ctx.Types.inferScalarType(Operand), VF)); - auto *DestTy = cast<VectorType>(ToVectorTy(getResultType(), VF)); + cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); + auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); // Arm TTI will use the underlying instruction to determine the cost. return Ctx.TTI.getCastInstrCost( Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput, @@ -1559,7 +1582,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode) << " "; + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); @@ -1572,10 +1595,10 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, } /// This function adds -/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) -/// to each vector element of Val. The sequence starts at StartIndex. +/// (0 * Step, 1 * Step, 2 * Step, ...) +/// to each vector element of Val. /// \p Opcode is relevant for FP induction variable. 
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, +static Value *getStepVector(Value *Val, Value *Step, Instruction::BinaryOps BinOp, ElementCount VF, IRBuilderBase &Builder) { assert(VF.isVector() && "only vector VFs are supported"); @@ -1600,11 +1623,7 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, } Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); Step = Builder.CreateVectorSplat(VLen, Step); assert(Step->getType() == Val->getType() && "Invalid step vec"); // FIXME: The newly created binary instructions should contain nsw/nuw @@ -1617,7 +1636,6 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && "Binary Opcode should be specified for FP induction"); InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); Step = Builder.CreateVectorSplat(VLen, Step); Value *MulOp = Builder.CreateFMul(InitVec, Step); @@ -1638,12 +1656,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { const InductionDescriptor &ID = getInductionDescriptor(); TruncInst *Trunc = getTruncInst(); IRBuilderBase &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(getPHINode()->getType() == ID.getStartValue()->getType() && + "Types must match"); assert(State.VF.isVector() && "must have vector VF"); // The value from the original loop to which we are mapping the new induction // variable. - Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode(); // Fast-math-flags propagate from the original induction instruction. 
IRBuilder<>::FastMathFlagGuard FMFG(Builder); @@ -1668,10 +1687,9 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + Value *SteppedStart = getStepVector(SplatStart, Step, ID.getInductionOpcode(), + State.VF, State.Builder); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -1711,14 +1729,14 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); + VecInd->setDebugLoc(getDebugLoc()); State.set(this, VecInd); Instruction *LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); if (isa<TruncInst>(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + LastInduction->setDebugLoc(getDebugLoc()); VecInd->addIncoming(SteppedStart, VectorPH); // Add induction update using an incorrect block temporarily. 
The phi node @@ -1732,20 +1750,13 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INDUCTION"; - if (getTruncInst()) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" "; - getVPValue(0)->printAsOperand(O, SlotTracker); - } else - O << " " << VPlanIngredient(IV); - - O << ", "; - getStepValue()->printAsOperand(O, SlotTracker); + O << Indent; + printAsOperand(O, SlotTracker); + O << " = WIDEN-INDUCTION "; + printOperands(O, SlotTracker); - O << ", "; - getVFValue()->printAsOperand(O, SlotTracker); + if (auto *TI = getTruncInst()) + O << " (truncated to " << *TI->getType() << ")"; } #endif @@ -1896,9 +1907,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { for (unsigned I = 0, E = getNumOperands(); I != E; I++) Ops.push_back(State.get(getOperand(I), VPLane(0))); - auto *NewGEP = - State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], - ArrayRef(Ops).drop_front(), "", isInBounds()); + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], + ArrayRef(Ops).drop_front(), "", + getGEPNoWrapFlags()); Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); State.set(this, Splat); State.addMetadata(Splat, GEP); @@ -1924,7 +1935,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. 
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, - Indices, "", isInBounds()); + Indices, "", getGEPNoWrapFlags()); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP); @@ -1975,9 +1986,10 @@ void VPReverseVectorPointerRecipe::execute(VPTransformState &State) { // LastLane = 1 - RunTimeVF Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); Value *Ptr = State.get(getOperand(0), VPLane(0)); - bool InBounds = isInBounds(); - Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); - ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", InBounds); + Value *ResultPtr = + Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); + ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", + getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -1987,9 +1999,8 @@ void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); - O << " = reverse-vector-pointer "; - if (isInBounds()) - O << "inbounds "; + O << " = reverse-vector-pointer"; + printFlags(O); printOperands(O, SlotTracker); } #endif @@ -2001,10 +2012,10 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); - bool InBounds = isInBounds(); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); - Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); + Value *ResultPtr = + Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -2066,8 +2077,8 @@ InstructionCost VPBlendRecipe::computeCost(ElementCount VF, if (vputils::onlyFirstLaneUsed(this)) return 
Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); - Type *ResultTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); + Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); return (getNumIncomingValues() - 1) * Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, CmpInst::BAD_ICMP_PREDICATE, CostKind); @@ -2104,6 +2115,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { // Propagate the fast-math flags carried by the underlying instruction. IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + State.setDebugLocFrom(getDebugLoc()); Value *NewVecOp = State.get(getVecOp()); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, State.VF.isScalar()); @@ -2188,7 +2200,7 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { RecurKind RdxKind = RdxDesc.getRecurrenceKind(); Type *ElementTy = Ctx.Types.inferScalarType(this); - auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned Opcode = RdxDesc.getOpcode(); @@ -2380,6 +2392,7 @@ InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, } void VPPredInstPHIRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); assert(State.Lane && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = cast<Instruction>(State.get(getOperand(0), *State.Lane)); @@ -2439,7 +2452,7 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = 
toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); unsigned AS = @@ -2586,7 +2599,7 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, // legacy model, it will always calculate the cost of mask. // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. - Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); unsigned AS = @@ -2707,7 +2720,7 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, // legacy model, it will always calculate the cost of mask. // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. - Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF); + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); const Align Alignment = getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient)); unsigned AS = @@ -3075,7 +3088,7 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, Type *ValTy = Ctx.Types.inferScalarType( getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx) : getStoredValues()[InsertPosIdx]); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -3111,31 +3124,14 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -bool VPCanonicalIVPHIRecipe::isCanonical( - InductionDescriptor::InductionKind Kind, VPValue *Start, - VPValue *Step) const { - // Must be an integer induction. - if (Kind != InductionDescriptor::IK_IntInduction) - return false; - // Start must match the start value of this canonical induction. 
- if (Start != getStartValue()) - return false; - - // If the step is defined by a recipe, it is not a ConstantInt. - if (Step->getDefiningRecipe()) - return false; - - ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); - return StepC && StepC->isOne(); -} - bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { return IsScalarAfterVectorization && (!IsScalable || vputils::onlyFirstLaneUsed(this)); } void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { - assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && + assert(getInductionDescriptor().getKind() == + InductionDescriptor::IK_PtrInduction && "Not a pointer induction according to InductionDescriptor!"); assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && "Unexpected type."); @@ -3160,6 +3156,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV->getIterator()); NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + NewPointerPhi->setDebugLoc(getDebugLoc()); } else { // The recipe has been unrolled. In that case, fetch the single pointer phi // shared among all unrolled parts of the recipe. @@ -3170,8 +3167,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { // A pointer induction, performed by using a gep BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); - Value *ScalarStepValue = State.get(getOperand(1), VPLane(0)); - Type *PhiType = IndDesc.getStep()->getType(); + Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); + Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); // Add induction update using an incorrect block temporarily. The phi node // will be fixed after VPlan execution. 
Note that at this point the latch @@ -3223,7 +3220,8 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = WIDEN-POINTER-INDUCTION "; getStartValue()->printAsOperand(O, SlotTracker); - O << ", " << *IndDesc.getStep(); + O << ", "; + getStepValue()->printAsOperand(O, SlotTracker); if (getNumOperands() == 4) { O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); @@ -3235,13 +3233,22 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.Lane && "cannot be used in per-lane"); + if (State.ExpandedSCEVs.contains(Expr)) { + // SCEV Expr has already been expanded, result must already be set. At the + // moment we have to execute the entry block twice (once before skeleton + // creation to get expanded SCEVs used by the skeleton and once during + // regular VPlan execution). + State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]); + assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] && + "Results must match"); + return; + } + const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); SCEVExpander Exp(SE, DL, "induction"); Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), &*State.Builder.GetInsertPoint()); - assert(!State.ExpandedSCEVs.contains(Expr) && - "Same SCEV expanded multiple times"); State.ExpandedSCEVs[Expr] = Res; State.set(this, Res, VPLane(0)); } @@ -3324,7 +3331,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, SmallVector<int> Mask(VF.getKnownMinValue()); std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); Type *VectorTy = - ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); + toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, cast<VectorType>(VectorTy), Mask, CostKind, @@ -3358,7 +3365,7 @@ void 
VPReductionPHIRecipe::execute(VPTransformState &State) { : VectorType::get(StartV->getType(), State.VF); BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.CurrentVectorLoop->getHeader() == HeaderBB && + assert(State.CurrentParentLoop->getHeader() == HeaderBB && "recipe must be in the vector loop header"); auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); Phi->insertBefore(HeaderBB->getFirstInsertionPt()); @@ -3380,6 +3387,22 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Builder.SetInsertPoint(VectorPH->getTerminator()); StartV = Iden = State.get(StartVPV); } + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // [I|F]FindLastIV will use a sentinel value to initialize the reduction + // phi or the resume value from the main vector loop when vectorizing the + // epilogue loop. In the exit block, ComputeReductionResult will generate + // checks to verify if the reduction result is the sentinel value. If the + // result is the sentinel value, it will be corrected back to the start + // value. + // TODO: The sentinel value is not always necessary. When the start value is + // a constant, and smaller than the start value of the induction variable, + // the start value can be directly used to initialize the reduction phi. 
+ Iden = StartV; + if (!ScalarPHI) { + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden); + } } else { Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); @@ -3483,7 +3506,7 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPScalarPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - Value *Start = State.get(getOperand(0), VPLane(0)); + Value *Start = State.get(getStartValue(), VPLane(0)); PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name); Phi->addIncoming(Start, VectorPH); Phi->setDebugLoc(getDebugLoc()); @@ -3493,7 +3516,7 @@ void VPScalarPHIRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "SCALAR-PHI"; + O << Indent << "SCALAR-PHI "; printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cee83d1015b5..8ac2bd5160c2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -61,8 +61,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, - &Plan->getVF(), *II); + NewRecipe = new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc()); } else { assert(isa<VPInstruction>(&Ingredient) && "only VPInstructions expected here"); @@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { // 
is connected to a successor replicate region with the same predicate by a // single, empty VPBasicBlock. static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { - SetVector<VPRegionBlock *> DeletedRegions; + SmallPtrSet<VPRegionBlock *, 4> TransformedRegions; // Collect replicate regions followed by an empty block, followed by another // replicate region with matching masks to process front. This is to avoid @@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { // Move recipes from Region1 to its successor region, if both are triangles. for (VPRegionBlock *Region1 : WorkList) { - if (DeletedRegions.contains(Region1)) + if (TransformedRegions.contains(Region1)) continue; auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor()); auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor()); @@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock); } VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock); - DeletedRegions.insert(Region1); + TransformedRegions.insert(Region1); } - for (VPRegionBlock *ToDelete : DeletedRegions) - delete ToDelete; - return !DeletedRegions.empty(); + return !TransformedRegions.empty(); } static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, @@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BlockInMask = PredRecipe->getMask(); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *Entry = + Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); // Replace predicated replicate recipe with a replicate recipe without a // mask but in the replicate region. 
@@ -318,17 +317,21 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PredRecipe->getUnderlyingInstr(), make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), PredRecipe->isUniform()); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); + auto *Pred = + Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); VPPredInstPHIRecipe *PHIRecipe = nullptr; if (PredRecipe->getNumUsers() != 0) { - PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask); + PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask, + RecipeWithoutMask->getDebugLoc()); PredRecipe->replaceAllUsesWith(PHIRecipe); PHIRecipe->setOperand(0, RecipeWithoutMask); } PredRecipe->eraseFromParent(); - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); + auto *Exiting = + Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + VPRegionBlock *Region = + Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. @@ -377,7 +380,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { continue; auto *PredVPBB = dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor()); - if (!PredVPBB || PredVPBB->getNumSuccessors() != 1) + if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 || + isa<VPIRBasicBlock>(PredVPBB)) continue; WorkList.push_back(VPBB); } @@ -394,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { VPBlockUtils::disconnectBlocks(VPBB, Succ); VPBlockUtils::connectBlocks(PredVPBB, Succ); } - delete VPBB; + // VPBB is now dead and will be cleaned up when the plan gets destroyed. 
} return !WorkList.empty(); } @@ -526,11 +530,8 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, VPValue *StartV, VPValue *Step, VPBuilder &Builder) { VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); - VPSingleDefRecipe *BaseIV = CanonicalIV; - if (!CanonicalIV->isCanonical(Kind, StartV, Step)) { - BaseIV = Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step, - "offset.idx"); - } + VPSingleDefRecipe *BaseIV = Builder.createDerivedIV( + Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx"); // Truncate base induction if needed. Type *CanonicalIVType = CanonicalIV->getScalarType(); @@ -661,6 +662,151 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { } } +/// Try to simplify recipe \p R. +static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { + using namespace llvm::VPlanPatternMatch; + + if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { + // Try to remove redundant blend recipes. + SmallPtrSet<VPValue *, 4> UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + return; + } + + if (Blend->isNormalized()) + return; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. 
+ VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + + SmallVector<VPValue *, 4> OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + return; + } + + VPValue *A; + if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { + VPValue *Trunc = R.getVPSingleValue(); + Type *TruncTy = TypeInfo.inferScalarType(Trunc); + Type *ATy = TypeInfo.inferScalarType(A); + if (TruncTy == ATy) { + Trunc->replaceAllUsesWith(A); + } else { + // Don't replace a scalarizing recipe with a widened cast. + if (isa<VPReplicateRecipe>(&R)) + return; + if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + + unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + ? Instruction::SExt + : Instruction::ZExt; + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + // UnderlyingExt has distinct return type, used to retain legacy cost. 
+ VPC->setUnderlyingValue(UnderlyingExt); + } + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } + } +#ifndef NDEBUG + // Verify that the cached type info is for both A and its users is still + // accurate by comparing it to freshly computed types. + VPTypeAnalysis TypeInfo2( + R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); + assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); + for (VPUser *U : A->users()) { + auto *R = cast<VPRecipeBase>(U); + for (VPValue *VPV : R->definedValues()) + assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); + } +#endif + } + + // Simplify (X && Y) || (X && !Y) -> X. + // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X + // && (Y || Z) and (X || !X) into true. This requires queuing newly created + // recipes to be visited during simplification. + VPValue *X, *Y, *X1, *Y1; + if (match(&R, + m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && + X == X1 && Y == Y1) { + R.getVPSingleValue()->replaceAllUsesWith(X); + R.eraseFromParent(); + return; + } + + if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + if (match(&R, m_Not(m_Not(m_VPValue(A))))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. 
+ if ((match(&R, + m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || + match(&R, + m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && + TypeInfo.inferScalarType(R.getOperand(1)) == + TypeInfo.inferScalarType(R.getVPSingleValue())) + return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); +} + +/// Try to simplify the recipes in \p Plan +static void simplifyRecipes(VPlan &Plan) { + ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( + Plan.getEntry()); + Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); + VPTypeAnalysis TypeInfo(CanonicalIVType); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + simplifyRecipe(R, TypeInfo); + } + } +} + void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { @@ -696,11 +842,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, VPInstruction::BranchOnCond, {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - SmallVector<VPValue *> PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); ExitingVPBB->appendRecipe(BOC); + + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible @@ -941,126 +1087,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - using namespace llvm::VPlanPatternMatch; - - if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { - // Try to remove redundant blend recipes. 
- SmallPtrSet<VPValue *, 4> UniqueValues; - if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) - UniqueValues.insert(Blend->getIncomingValue(0)); - for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (!match(Blend->getMask(I), m_False())) - UniqueValues.insert(Blend->getIncomingValue(I)); - - if (UniqueValues.size() == 1) { - Blend->replaceAllUsesWith(*UniqueValues.begin()); - Blend->eraseFromParent(); - return; - } - - if (Blend->isNormalized()) - return; - - // Normalize the blend so its first incoming value is used as the initial - // value with the others blended into it. - - unsigned StartIndex = 0; - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - // If a value's mask is used only by the blend then is can be deadcoded. - // TODO: Find the most expensive mask that can be deadcoded, or a mask - // that's used by multiple blends where it can be removed from them all. - VPValue *Mask = Blend->getMask(I); - if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { - StartIndex = I; - break; - } - } - - SmallVector<VPValue *, 4> OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - if (I == StartIndex) - continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - return; - } - - VPValue *A; - if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { - VPValue *Trunc = R.getVPSingleValue(); - Type *TruncTy = TypeInfo.inferScalarType(Trunc); - Type *ATy = TypeInfo.inferScalarType(A); - if (TruncTy == ATy) { - Trunc->replaceAllUsesWith(A); - } else { - // Don't replace a 
scalarizing recipe with a widened cast. - if (isa<VPReplicateRecipe>(&R)) - return; - if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) - ? Instruction::SExt - : Instruction::ZExt; - auto *VPC = - new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { - // UnderlyingExt has distinct return type, used to retain legacy cost. - VPC->setUnderlyingValue(UnderlyingExt); - } - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { - auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } - } -#ifndef NDEBUG - // Verify that the cached type info is for both A and its users is still - // accurate by comparing it to freshly computed types. - VPTypeAnalysis TypeInfo2( - R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); - assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); - for (VPUser *U : A->users()) { - auto *R = cast<VPRecipeBase>(U); - for (VPValue *VPV : R->definedValues()) - assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); - } -#endif - } - - // Simplify (X && Y) || (X && !Y) -> X. - // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X - // && (Y || Z) and (X || !X) into true. This requires queuing newly created - // recipes to be visited during simplification. 
- VPValue *X, *Y, *X1, *Y1; - if (match(&R, - m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), - m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && - X == X1 && Y == Y1) { - R.getVPSingleValue()->replaceAllUsesWith(X); - R.eraseFromParent(); - return; - } - - if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) - return R.getVPSingleValue()->replaceAllUsesWith(A); -} - /// Move loop-invariant recipes out of the vector loop region in \p Plan. static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1095,19 +1121,6 @@ static void licm(VPlan &Plan) { } } -/// Try to simplify the recipes in \p Plan. -static void simplifyRecipes(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getEntry()); - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } - } -} - void VPlanTransforms::truncateToMinimalBitwidths( VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) { #ifndef NDEBUG @@ -1247,11 +1260,11 @@ void VPlanTransforms::optimize(VPlan &Plan) { simplifyRecipes(Plan); legalizeAndOptimizeInductions(Plan); + removeRedundantExpandSCEVRecipes(Plan); + simplifyRecipes(Plan); removeDeadRecipes(Plan); createAndOptimizeReplicateRegions(Plan); - - removeRedundantExpandSCEVRecipes(Plan); mergeBlocksIntoPredecessors(Plan); licm(Plan); } @@ -1438,112 +1451,134 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->replaceAllUsesWith(LaneMask); } +/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns +/// nullptr if no EVL-based recipe could be created. +/// \p HeaderMask Header Mask. +/// \p CurRecipe Recipe to be transform. +/// \p TypeInfo VPlan-based type analysis. 
+/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. +/// \p EVL The explicit vector length parameter of vector-predication +/// intrinsics. +static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, + VPRecipeBase &CurRecipe, + VPTypeAnalysis &TypeInfo, + VPValue &AllOneMask, VPValue &EVL) { + using namespace llvm::VPlanPatternMatch; + auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { + assert(OrigMask && "Unmasked recipe when folding tail"); + return HeaderMask == OrigMask ? nullptr : OrigMask; + }; + + return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) + .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); + }) + .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { + VPValue *NewMask = GetNewMask(S->getMask()); + return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); + }) + .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * { + unsigned Opcode = W->getOpcode(); + if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) + return nullptr; + return new VPWidenEVLRecipe(*W, EVL); + }) + .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { + VPValue *NewMask = GetNewMask(Red->getCondOp()); + return new VPReductionEVLRecipe(*Red, EVL, NewMask); + }) + .Case<VPWidenIntrinsicRecipe, VPWidenCastRecipe>( + [&](auto *CR) -> VPRecipeBase * { + Intrinsic::ID VPID; + if (auto *CallR = dyn_cast<VPWidenIntrinsicRecipe>(CR)) { + VPID = + VPIntrinsic::getForIntrinsic(CallR->getVectorIntrinsicID()); + } else { + auto *CastR = cast<VPWidenCastRecipe>(CR); + VPID = VPIntrinsic::getForOpcode(CastR->getOpcode()); + } + assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic"); + assert(VPIntrinsic::getMaskParamPos(VPID) && + VPIntrinsic::getVectorLengthParamPos(VPID) && + "Expected VP intrinsic"); + + SmallVector<VPValue *> Ops(CR->operands()); + Ops.push_back(&AllOneMask); + Ops.push_back(&EVL); + return 
new VPWidenIntrinsicRecipe( + VPID, Ops, TypeInfo.inferScalarType(CR), CR->getDebugLoc()); + }) + .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) { + SmallVector<VPValue *> Ops(Sel->operands()); + Ops.push_back(&EVL); + return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops, + TypeInfo.inferScalarType(Sel), + Sel->getDebugLoc()); + }) + .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { + VPValue *LHS, *RHS; + // Transform select with a header mask condition + // select(header_mask, LHS, RHS) + // into vector predication merge. + // vp.merge(all-true, LHS, RHS, EVL) + if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), + m_VPValue(RHS)))) + return nullptr; + // Use all true as the condition because this transformation is + // limited to selects whose condition is a header mask. + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); + }) + .Default([&](VPRecipeBase *R) { return nullptr; }); +} + /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { - using namespace llvm::VPlanPatternMatch; Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); VPTypeAnalysis TypeInfo(CanonicalIVType); LLVMContext &Ctx = CanonicalIVType->getContext(); - SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan); + VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); for (VPUser *U : Plan.getVF().users()) { if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U)) R->setOperand(1, &EVL); } + SmallVector<VPRecipeBase *> ToErase; + for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - return HeaderMask == OrigMask ? 
nullptr : OrigMask; - }; - - VPRecipeBase *NewRecipe = - TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); - }) - .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * { - unsigned Opcode = W->getOpcode(); - if (!Instruction::isBinaryOp(Opcode) && - !Instruction::isUnaryOp(Opcode)) - return nullptr; - return new VPWidenEVLRecipe(*W, EVL); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPWidenIntrinsicRecipe>( - [&](VPWidenIntrinsicRecipe *CInst) -> VPRecipeBase * { - auto *CI = cast<CallInst>(CInst->getUnderlyingInstr()); - Intrinsic::ID VPID = VPIntrinsic::getForIntrinsic( - CI->getCalledFunction()->getIntrinsicID()); - if (VPID == Intrinsic::not_intrinsic) - return nullptr; - - SmallVector<VPValue *> Ops(CInst->operands()); - assert(VPIntrinsic::getMaskParamPos(VPID) && - VPIntrinsic::getVectorLengthParamPos(VPID) && - "Expected VP intrinsic"); - VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue( - IntegerType::getInt1Ty(CI->getContext()))); - Ops.push_back(Mask); - Ops.push_back(&EVL); - return new VPWidenIntrinsicRecipe( - *CI, VPID, Ops, TypeInfo.inferScalarType(CInst), - CInst->getDebugLoc()); - }) - .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) { - SmallVector<VPValue *> Ops(Sel->operands()); - Ops.push_back(&EVL); - return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops, - TypeInfo.inferScalarType(Sel), - Sel->getDebugLoc()); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // 
select(header_mask, LHS, RHS) - // into vector predication merge. - // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - VPValue *AllTrue = - Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); - - if (!NewRecipe) + VPRecipeBase *EVLRecipe = + createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); + [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); assert( NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); - NewRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) { + EVLRecipe->insertBefore(CurRecipe); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } - CurRecipe->eraseFromParent(); + // Defer erasing recipes till the end so that we don't invalidate the + // VPTypeAnalysis cache. 
+ ToErase.push_back(CurRecipe); } - recursivelyDeleteDeadRecipes(HeaderMask); + } + + for (VPRecipeBase *R : reverse(ToErase)) { + SmallVector<VPValue *> PossiblyDead(R->operands()); + R->eraseFromParent(); + for (VPValue *Op : PossiblyDead) + recursivelyDeleteDeadRecipes(Op); } } @@ -1667,8 +1702,8 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // instruction. Widen memory instructions involved in address computation // will lead to gather/scatter instructions, which don't need to be // handled. - if (isa<VPWidenMemoryRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || - isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec)) + if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe, + VPHeaderPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1820,9 +1855,7 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::prepareToExecute(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getVectorLoopRegion()); +void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) { @@ -1840,3 +1873,62 @@ void VPlanTransforms::prepareToExecute(VPlan &Plan) { } } } + +void VPlanTransforms::handleUncountableEarlyExit( + VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, + BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto *LatchVPBB = cast<VPBasicBlock>(LoopRegion->getExiting()); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *MiddleVPBB = Plan.getMiddleBlock(); + VPValue *IsEarlyExitTaken = nullptr; + + // Process the uncountable exiting block. Update IsEarlyExitTaken, which + // tracks if the uncountable early exit has been taken. 
Also split the middle + // block and have it conditionally branch to the early exit block if + // EarlyExitTaken. + auto *EarlyExitingBranch = + cast<BranchInst>(UncountableExitingBlock->getTerminator()); + BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0); + BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1); + + // The early exit block may or may not be the same as the "countable" exit + // block. Creates a new VPIRBB for the early exit block in case it is distinct + // from the countable exit block. + // TODO: Introduce both exit blocks during VPlan skeleton construction. + VPIRBasicBlock *VPEarlyExitBlock; + if (OrigLoop->getUniqueExitBlock()) { + VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]); + } else { + VPEarlyExitBlock = Plan.createVPIRBasicBlock( + !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + } + + VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask( + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond); + IsEarlyExitTaken = + Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); + + VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); + VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); + VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); + NewMiddle->swapSuccessors(); + + VPBuilder MiddleBuilder(NewMiddle); + MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); + + // Replace the condition controlling the non-early exit from the vector loop + // with one exiting if either the original condition of the vector latch is + // true or the early exit has been taken. 
+ auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + auto *AnyExitTaken = Builder.createNaryOp( + Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1491e0a8df04..fddde8689116 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -124,8 +124,19 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. static void removeDeadRecipes(VPlan &Plan); + /// Update \p Plan to account for the uncountable early exit block in \p + /// UncountableExitingBlock by + /// * updating the condition exiting the vector loop to include the early + /// exit conditions + /// * splitting the original middle block to branch to the early exit block + /// if taken. + static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + BasicBlock *UncountableExitingBlock, + VPRecipeBuilder &RecipeBuilder); + /// Lower abstract recipes to concrete ones, that can be codegen'd. - static void prepareToExecute(VPlan &Plan); + static void convertToConcreteRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index f653269713b3..89e372d6b46c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -168,7 +168,7 @@ void UnrollState::unrollWidenInductionByUF( auto *ConstStep = ScalarStep->isLiveIn() ? 
dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue()) : nullptr; - if (!ConstStep || ConstStep->getZExtValue() != 1) { + if (!ConstStep || ConstStep->getValue() != 1) { if (TypeInfo.inferScalarType(ScalarStep) != IVTy) { ScalarStep = Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy); @@ -412,8 +412,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { UnrollState Unroller(Plan, UF, Ctx); - Unroller.unrollBlock(Plan.getPreheader()); - // Iterate over all blocks in the plan starting from Entry, and unroll // recipes inside them. This includes the vector preheader and middle blocks, // which may set up or post-process per-part values. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4621c28b0512..e40af3e2e3d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -34,7 +34,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, Expanded = Plan.getOrAddLiveIn(E->getValue()); else { Expanded = new VPExpandSCEVRecipe(Expr, SE); - Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe()); + Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); } Plan.addSCEVExpansion(Expr, Expanded); return Expanded; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 71c7d547ac7d..be420a873bef 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -185,7 +185,7 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { RecipeNumbering[&R] = Cnt++; for (const VPRecipeBase &R : *VPBB) { - if (isa<VPIRInstruction>(&R) ^ isa<VPIRBasicBlock>(VPBB)) { + if (isa<VPIRInstruction>(&R) && !isa<VPIRBasicBlock>(VPBB)) { errs() << "VPIRInstructions "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) R.dump(); diff --git 
a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index b9caf8c0df9b..493ed95b1d22 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -67,9 +67,10 @@ class VectorCombine { public: VectorCombine(Function &F, const TargetTransformInfo &TTI, const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, - const DataLayout *DL, bool TryEarlyFoldsOnly) + const DataLayout *DL, TTI::TargetCostKind CostKind, + bool TryEarlyFoldsOnly) : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL), - TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} + CostKind(CostKind), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} bool run(); @@ -81,6 +82,7 @@ private: AAResults &AA; AssumptionCache &AC; const DataLayout *DL; + TTI::TargetCostKind CostKind; /// If true, only perform beneficial early IR transforms. Do not introduce new /// vector operations. @@ -113,6 +115,7 @@ private: bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool foldConcatOfBoolMasks(Instruction &I); bool foldPermuteOfBinops(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleOfCastops(Instruction &I); @@ -125,6 +128,8 @@ private: bool shrinkType(Instruction &I); void replaceValue(Value &Old, Value &New) { + LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); + LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); if (auto *NewI = dyn_cast<Instruction>(&New)) { New.takeName(&Old); @@ -135,10 +140,18 @@ private: } void eraseInstruction(Instruction &I) { - for (Value *Op : I.operands()) - Worklist.pushValue(Op); + LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); + SmallVector<Value *> Ops(I.operands()); Worklist.remove(&I); I.eraseFromParent(); + + // Push remaining users of the operands and then the operand itself - allows + // further folds that were hindered by OneUse 
limits. + for (Value *Op : Ops) + if (auto *OpI = dyn_cast<Instruction>(Op)) { + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } }; } // namespace @@ -176,8 +189,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // Match insert into fixed vector of scalar value. // TODO: Handle non-zero insert index. Value *Scalar; - if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || - !Scalar->hasOneUse()) + if (!match(&I, + m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt()))) return false; // Optionally match an extract from another vector. @@ -247,16 +260,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { Type *LoadTy = Load->getType(); unsigned AS = Load->getPointerAddressSpace(); InstructionCost OldCost = - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind); APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, /* Insert */ true, HasExtract, CostKind); // New pattern: load VecPtr InstructionCost NewCost = - TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); + TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind); // Optionally, we are shuffling the loaded vector element(s) into place. // For the mask set everything but element 0 to undef to prevent poison from // propagating from the extra loaded memory. 
This will also optionally @@ -270,7 +282,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); Mask[0] = OffsetEltIndex; if (OffsetEltIndex) - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind); // We can aggressively convert to the vector form because the backend can // invert this transform if it does not result in a performance win. @@ -329,11 +342,11 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) { // undef value is 0. We could add that cost if the cost model accurately // reflects the real cost of that operation. InstructionCost OldCost = - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind); // New pattern: load PtrOp InstructionCost NewCost = - TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS); + TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind); // We can aggressively convert to the vector form because the backend can // invert this transform if it does not result in a performance win. @@ -366,7 +379,6 @@ ExtractElementInst *VectorCombine::getShuffleExtract( return nullptr; Type *VecTy = Ext0->getVectorOperand()->getType(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); InstructionCost Cost0 = TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); @@ -420,23 +432,22 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // Get cost estimates for scalar and vector versions of the operation. 
bool IsBinOp = Instruction::isBinaryOp(Opcode); if (IsBinOp) { - ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); - VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); + ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); + VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); } else { assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && "Expected a compare"); CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); ScalarOpCost = TTI.getCmpSelInstrCost( - Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); + Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind); VectorOpCost = TTI.getCmpSelInstrCost( - Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); + Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind); } // Get cost estimates for the extract elements. These costs will factor into // both sequences. unsigned Ext0Index = Ext0IndexC->getZExtValue(); unsigned Ext1Index = Ext1IndexC->getZExtValue(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Extract0Cost = TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); @@ -596,7 +607,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { return false; Instruction *I0, *I1; - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) return false; @@ -665,9 +676,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) return false; - // TODO: We could handle this with a length-changing shuffle. 
auto *VecTy = cast<FixedVectorType>(I.getType()); - if (SrcVec->getType() != VecTy) + auto *ScalarTy = VecTy->getScalarType(); + auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType()); + if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) return false; // Ignore bogus insert/extract index. @@ -681,11 +693,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { SmallVector<int> Mask(NumElts); std::iota(Mask.begin(), Mask.end(), 0); Mask[Index] = Index + NumElts; - - Type *ScalarTy = VecTy->getScalarType(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost OldCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) + + TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); // If the extract has one use, it will be eliminated, so count it in the @@ -695,17 +704,36 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); InstructionCost NewCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); + TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + + bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; + // If the lengths of the two vectors are not equal, + // we need to add a length-change vector. Add this cost. 
+ SmallVector<int> SrcMask; + if (NeedLenChg) { + SrcMask.assign(NumElts, PoisonMaskElem); + SrcMask[Index] = Index; + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + SrcVecTy, SrcMask, CostKind); + } if (NewCost > OldCost) return false; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> - // shuffle DestVec, (fneg SrcVec), Mask + Value *NewShuf; + // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); - Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); - replaceValue(I, *Shuf); + if (NeedLenChg) { + // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask + Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); + NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); + } else { + // shuffle DestVec, (fneg SrcVec), Mask + NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + } + + replaceValue(I, *NewShuf); return true; } @@ -772,22 +800,25 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) { unsigned NumOps = IsUnary ? 1 : 2; // The new shuffle must not cost more than the old shuffle. - TargetTransformInfo::TargetCostKind CK = - TargetTransformInfo::TCK_RecipThroughput; TargetTransformInfo::ShuffleKind SK = IsUnary ? 
TargetTransformInfo::SK_PermuteSingleSrc : TargetTransformInfo::SK_PermuteTwoSrc; - InstructionCost DestCost = - TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) + + InstructionCost NewCost = + TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) + (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy, TargetTransformInfo::CastContextHint::None, - CK)); - InstructionCost SrcCost = - TTI.getShuffleCost(SK, SrcTy, Mask, CK) + + CostKind)); + InstructionCost OldCost = + TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) + TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy, - TargetTransformInfo::CastContextHint::None, CK); - if (DestCost > SrcCost || !DestCost.isValid()) + TargetTransformInfo::CastContextHint::None, + CostKind); + + LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: " + << OldCost << " vs NewCost: " << NewCost << "\n"); + + if (NewCost > OldCost || !NewCost.isValid()) return false; // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC' @@ -841,13 +872,13 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { // Calculate cost of splatting both operands into vectors and the vector // intrinsic VectorType *VecTy = cast<VectorType>(VPI.getType()); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; SmallVector<int> Mask; if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy)) Mask.resize(FVTy->getNumElements(), 0); InstructionCost SplatCost = TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask); + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask, + CostKind); // Calculate the cost of the VP Intrinsic SmallVector<Type *, 4> Args; @@ -873,8 +904,8 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); } else { - ScalarOpCost = - 
TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType()); + ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode, + VecTy->getScalarType(), CostKind); } // The existing splats may be kept around if other instructions use them. @@ -924,7 +955,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { /// Match a vector binop or compare instruction with at least one inserted /// scalar operand and convert to scalar binop/cmp followed by insertelement. bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; Value *Ins0, *Ins1; if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) @@ -993,17 +1024,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { if (IsCmp) { CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); ScalarOpCost = TTI.getCmpSelInstrCost( - Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); + Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind); VectorOpCost = TTI.getCmpSelInstrCost( - Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); + Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind); } else { - ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); - VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); + ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); + VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); } // Get cost estimate for the insert element. This cost will factor into // both sequences. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost InsertCost = TTI.getVectorInstrCost( Instruction::InsertElement, VecTy, CostKind, Index); InstructionCost OldCost = @@ -1065,9 +1095,11 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); Instruction *I0, *I1; Constant *C0, *C1; - CmpInst::Predicate P0, P1; + CmpPredicate P0, P1; + // FIXME: Use CmpPredicate::getMatching here. if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) || - !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1) + !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || + P0 != static_cast<CmpInst::Predicate>(P1)) return false; // The compare operands must be extracts of the same vector with constant @@ -1080,7 +1112,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { auto *Ext0 = cast<ExtractElementInst>(I0); auto *Ext1 = cast<ExtractElementInst>(I1); - ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); + ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind); if (!ConvertToShuf) return false; assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) && @@ -1089,23 +1121,23 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // The original scalar pattern is: // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) CmpInst::Predicate Pred = P0; - unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp - : Instruction::ICmp; + unsigned CmpOpcode = + CmpInst::isFPPredicate(Pred) ? 
Instruction::FCmp : Instruction::ICmp; auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); if (!VecTy) return false; - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Ext0Cost = - TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0), - Ext1Cost = - TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); + InstructionCost Ext1Cost = + TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); + InstructionCost CmpCost = TTI.getCmpSelInstrCost( + CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred, + CostKind); + InstructionCost OldCost = - Ext0Cost + Ext1Cost + - TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), - CmpInst::makeCmpResultType(I0->getType()), Pred) * - 2 + - TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); + Ext0Cost + Ext1Cost + CmpCost * 2 + + TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind); // The proposed vector pattern is: // vcmp = cmp Pred X, VecC @@ -1114,12 +1146,13 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); InstructionCost NewCost = TTI.getCmpSelInstrCost( - CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); + CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred, + CostKind); SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); ShufMask[CheapIndex] = ExpensiveIndex; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, - ShufMask); - NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); + ShufMask, CostKind); + NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind); NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost; NewCost += Ext1->hasOneUse() ? 
0 : Ext1Cost; @@ -1311,6 +1344,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { MemoryLocation::get(SI), AA)) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(Load); + if (ScalarizableIdx.isSafeWithFreeze()) ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); Value *GEP = Builder.CreateInBoundsGEP( @@ -1336,14 +1373,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *VecTy = cast<VectorType>(I.getType()); auto *LI = cast<LoadInst>(&I); + auto *VecTy = cast<VectorType>(LI->getType()); if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; InstructionCost OriginalCost = TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), - LI->getPointerAddressSpace()); + LI->getPointerAddressSpace(), CostKind); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; @@ -1377,7 +1414,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LastCheckedInst = UI; } - auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); + auto ScalarIdx = + canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT); if (ScalarIdx.isUnsafe()) return false; if (ScalarIdx.isSafeWithFreeze()) { @@ -1385,24 +1423,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { ScalarIdx.discard(); } - auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand()); OriginalCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, Index ? 
Index->getZExtValue() : -1); ScalarizedCost += TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), - Align(1), LI->getPointerAddressSpace()); + Align(1), LI->getPointerAddressSpace(), CostKind); ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); } if (ScalarizedCost >= OriginalCost) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(LI); + // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); - Value *Idx = EI->getOperand(1); + Value *Idx = EI->getIndexOperand(); // Insert 'freeze' for poison indexes. auto It = NeedFreeze.find(EI); @@ -1426,6 +1467,117 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))" +/// to "(bitcast (concat X, Y))" +/// where X/Y are bitcasted from i1 mask vectors. +bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) { + Type *Ty = I.getType(); + if (!Ty->isIntegerTy()) + return false; + + // TODO: Add big endian test coverage + if (DL->isBigEndian()) + return false; + + // Restrict to disjoint cases so the mask vectors aren't overlapping. 
+ Instruction *X, *Y; + if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y)))) + return false; + + // Allow both sources to contain shl, to handle more generic pattern: + // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))" + Value *SrcX; + uint64_t ShAmtX = 0; + if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) && + !match(X, m_OneUse( + m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))), + m_ConstantInt(ShAmtX))))) + return false; + + Value *SrcY; + uint64_t ShAmtY = 0; + if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) && + !match(Y, m_OneUse( + m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))), + m_ConstantInt(ShAmtY))))) + return false; + + // Canonicalize larger shift to the RHS. + if (ShAmtX > ShAmtY) { + std::swap(X, Y); + std::swap(SrcX, SrcY); + std::swap(ShAmtX, ShAmtY); + } + + // Ensure both sources are matching vXi1 bool mask types, and that the shift + // difference is the mask width so they can be easily concatenated together. + uint64_t ShAmtDiff = ShAmtY - ShAmtX; + unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0); + unsigned BitWidth = Ty->getPrimitiveSizeInBits(); + auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType()); + if (!MaskTy || SrcX->getType() != SrcY->getType() || + !MaskTy->getElementType()->isIntegerTy(1) || + MaskTy->getNumElements() != ShAmtDiff || + MaskTy->getNumElements() > (BitWidth / 2)) + return false; + + auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy); + auto *ConcatIntTy = + Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements()); + auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff); + + SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements()); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + + // TODO: Is it worth supporting multi use cases? 
+ InstructionCost OldCost = 0; + OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind); + OldCost += + NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy, + TTI::CastContextHint::None, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy, + TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = 0; + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy, + ConcatMask, CostKind); + NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy, + TTI::CastContextHint::None, CostKind); + if (Ty != ConcatIntTy) + NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy, + TTI::CastContextHint::None, CostKind); + if (ShAmtX > 0) + NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + + LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + + if (NewCost > OldCost) + return false; + + // Build bool mask concatenation, bitcast back to scalar integer, and perform + // any residual zero-extension or shifting. + Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask); + Worklist.pushValue(Concat); + + Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy); + + if (Ty != ConcatIntTy) { + Worklist.pushValue(Result); + Result = Builder.CreateZExt(Result, Ty); + } + + if (ShAmtX > 0) { + Worklist.pushValue(Result); + Result = Builder.CreateShl(Result, ShAmtX); + } + + replaceValue(I, *Result); + return true; +} + /// Try to convert "shuffle (binop (shuffle, shuffle)), undef" /// --> "binop (shuffle), (shuffle)". bool VectorCombine::foldPermuteOfBinops(Instruction &I) { @@ -1480,8 +1632,6 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { } // Try to merge shuffles across the binop if the new shuffles are not costly. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost OldCost = TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy, @@ -1523,34 +1673,46 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { } /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". +/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)". bool VectorCombine::foldShuffleOfBinops(Instruction &I) { - BinaryOperator *B0, *B1; ArrayRef<int> OldMask; - if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), - m_Mask(OldMask)))) - return false; - - // Don't introduce poison into div/rem. - if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem()) + Instruction *LHS, *RHS; + if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)), + m_OneUse(m_Instruction(RHS)), m_Mask(OldMask)))) return false; // TODO: Add support for addlike etc. - Instruction::BinaryOps Opcode = B0->getOpcode(); - if (Opcode != B1->getOpcode()) + if (LHS->getOpcode() != RHS->getOpcode()) + return false; + + Value *X, *Y, *Z, *W; + bool IsCommutative = false; + CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE; + if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) && + match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) { + auto *BO = cast<BinaryOperator>(LHS); + // Don't introduce poison into div/rem. 
+ if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem()) + return false; + IsCommutative = BinaryOperator::isCommutative(BO->getOpcode()); + } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) && + match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) && + (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) { + IsCommutative = cast<CmpInst>(LHS)->isCommutative(); + } else return false; auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); - auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType()); - if (!ShuffleDstTy || !BinOpTy) + auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType()); + auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType()); + if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType()) return false; unsigned NumSrcElts = BinOpTy->getNumElements(); // If we have something like "add X, Y" and "add Z, X", swap ops to match. - Value *X = B0->getOperand(0), *Y = B0->getOperand(1); - Value *Z = B1->getOperand(0), *W = B1->getOperand(1); - if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W && - (X == W || Y == Z)) + if (IsCommutative && X != Z && Y != W && (X == W || Y == Z)) std::swap(X, Y); auto ConvertToUnary = [NumSrcElts](int &M) { @@ -1575,33 +1737,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { } // Try to replace a binop with a shuffle if the shuffle is not costly. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost OldCost = - TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) + - TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, - OldMask, CostKind, 0, nullptr, {B0, B1}, &I); + TTI.getInstructionCost(LHS, CostKind) + + TTI.getInstructionCost(RHS, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, + OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); InstructionCost NewCost = TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + - TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) + - TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); + + if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) { + NewCost += + TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind); + } else { + auto *ShuffleCmpTy = + FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy); + NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, + ShuffleDstTy, PredLHS, CostKind); + } LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); - if (NewCost >= OldCost) + + // If either shuffle will constant fold away, then fold for the same cost as + // we will reduce the instruction count. + bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) || + (isa<Constant>(Y) && isa<Constant>(W)); + if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); - Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE + ? 
Builder.CreateBinOp( + cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1) + : Builder.CreateCmp(PredLHS, Shuf0, Shuf1); // Intersect flags from the old binops. if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { - NewInst->copyIRFlags(B0); - NewInst->andIRFlags(B1); + NewInst->copyIRFlags(LHS); + NewInst->andIRFlags(RHS); } Worklist.pushValue(Shuf0); @@ -1672,8 +1849,6 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size()); // Try to replace a castop with a shuffle if the shuffle is not costly. - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost CostC0 = TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, TTI::CastContextHint::None, CostKind); @@ -1715,77 +1890,123 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { return true; } -/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)" +/// Try to convert any of: +/// "shuffle (shuffle x, y), (shuffle y, x)" +/// "shuffle (shuffle x, undef), (shuffle y, undef)" +/// "shuffle (shuffle x, undef), y" +/// "shuffle x, (shuffle y, undef)" /// into "shuffle x, y". 
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { - Value *V0, *V1; - UndefValue *U0, *U1; - ArrayRef<int> OuterMask, InnerMask0, InnerMask1; + ArrayRef<int> OuterMask; + Value *OuterV0, *OuterV1; if (!match(&I, - m_Shuffle( - m_Shuffle(m_Value(V0), m_UndefValue(U0), m_Mask(InnerMask0)), - m_Shuffle(m_Value(V1), m_UndefValue(U1), m_Mask(InnerMask1)), - m_Mask(OuterMask)))) + m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask)))) + return false; + + ArrayRef<int> InnerMask0, InnerMask1; + Value *X0, *X1, *Y0, *Y1; + bool Match0 = + match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0))); + bool Match1 = + match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1))); + if (!Match0 && !Match1) return false; - auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0)); - auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1)); + X0 = Match0 ? X0 : OuterV0; + Y0 = Match0 ? Y0 : OuterV0; + X1 = Match1 ? X1 : OuterV1; + Y1 = Match1 ? Y1 : OuterV1; auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); - auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType()); - auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType()); + auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType()); + auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType()); if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy || - V0->getType() != V1->getType()) + X0->getType() != X1->getType()) return false; unsigned NumSrcElts = ShuffleSrcTy->getNumElements(); unsigned NumImmElts = ShuffleImmTy->getNumElements(); - // Bail if either inner masks reference a RHS undef arg. - if ((!isa<PoisonValue>(U0) && - any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) || - (!isa<PoisonValue>(U1) && - any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; }))) - return false; - - // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem, + // Attempt to merge shuffles, matching upto 2 source operands. 
+ // Replace index to a poison arg with PoisonMaskElem. + // Bail if either inner masks reference an undef arg. SmallVector<int, 16> NewMask(OuterMask); + Value *NewX = nullptr, *NewY = nullptr; for (int &M : NewMask) { + Value *Src = nullptr; if (0 <= M && M < (int)NumImmElts) { - M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M]; + Src = OuterV0; + if (Match0) { + M = InnerMask0[M]; + Src = M >= (int)NumSrcElts ? Y0 : X0; + M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M; + } } else if (M >= (int)NumImmElts) { - if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts) + Src = OuterV1; + M -= NumImmElts; + if (Match1) { + M = InnerMask1[M]; + Src = M >= (int)NumSrcElts ? Y1 : X1; + M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M; + } + } + if (Src && M != PoisonMaskElem) { + assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index"); + if (isa<UndefValue>(Src)) { + // We've referenced an undef element - if its poison, update the shuffle + // mask, else bail. + if (!isa<PoisonValue>(Src)) + return false; M = PoisonMaskElem; - else - M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts); + continue; + } + if (!NewX || NewX == Src) { + NewX = Src; + continue; + } + if (!NewY || NewY == Src) { + M += NumSrcElts; + NewY = Src; + continue; + } + return false; } } + if (!NewX) + return PoisonValue::get(ShuffleDstTy); + if (!NewY) + NewY = PoisonValue::get(ShuffleSrcTy); + // Have we folded to an Identity shuffle? if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) { - replaceValue(I, *V0); + replaceValue(I, *NewX); return true; } // Try to merge the shuffles if the new shuffle is not costly. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - - InstructionCost InnerCost0 = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, - InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0); - InstructionCost InnerCost1 = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, - InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1); - InstructionCost OuterCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, - OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I); + InstructionCost InnerCost0 = 0; + if (Match0) + InnerCost0 = TTI.getInstructionCost(cast<Instruction>(OuterV0), CostKind); + + InstructionCost InnerCost1 = 0; + if (Match1) + InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); + + InstructionCost OuterCost = TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind, + 0, nullptr, {OuterV0, OuterV1}, &I); + InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost; - InstructionCost NewCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, - NewMask, CostKind, 0, nullptr, {V0, V1}); - if (!ShufI0->hasOneUse()) + bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; }); + TargetTransformInfo::ShuffleKind SK = + IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc + : TargetTransformInfo::SK_PermuteTwoSrc; + InstructionCost NewCost = TTI.getShuffleCost( + SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY}); + if (!OuterV0->hasOneUse()) NewCost += InnerCost0; - if (!ShufI1->hasOneUse()) + if (!OuterV1->hasOneUse()) NewCost += InnerCost1; LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I @@ -1794,13 +2015,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { if (NewCost > OldCost) return false; - // Clear unused sources to poison. 
- if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; })) - V0 = PoisonValue::get(ShuffleSrcTy); - if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; })) - V1 = PoisonValue::get(ShuffleSrcTy); - - Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask); + Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask); replaceValue(I, *Shuf); return true; } @@ -1832,32 +2047,30 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) { return false; for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) - if (isVectorIntrinsicWithScalarOpAtArg(IID, I) && + if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) && II0->getArgOperand(I) != II1->getArgOperand(I)) return false; InstructionCost OldCost = - TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), - TTI::TCK_RecipThroughput) + - TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), - TTI::TCK_RecipThroughput) + + TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) + + TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask, - TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I); + CostKind, 0, nullptr, {II0, II1}, &I); SmallVector<Type *> NewArgsTy; InstructionCost NewCost = 0; for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) - if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) { + if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) { NewArgsTy.push_back(II0->getArgOperand(I)->getType()); } else { auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType()); NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(), VecTy->getNumElements() * 2)); NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - VecTy, OldMask, TTI::TCK_RecipThroughput); + VecTy, OldMask, CostKind); } IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy); - NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput); 
+ NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind); LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost @@ -1868,7 +2081,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) { SmallVector<Value *> NewArgs; for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) - if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) { + if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) { NewArgs.push_back(II0->getArgOperand(I)); } else { Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), @@ -1923,7 +2136,7 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { } /// Detect concat of multiple values into a vector -static bool isFreeConcat(ArrayRef<InstLane> Item, +static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI) { auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType()); unsigned NumElts = Ty->getNumElements(); @@ -1934,8 +2147,7 @@ static bool isFreeConcat(ArrayRef<InstLane> Item, // during legalization. SmallVector<int, 16> ConcatMask(NumElts * 2); std::iota(ConcatMask.begin(), ConcatMask.end(), 0); - if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, - TTI::TCK_RecipThroughput) != 0) + if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0) return false; unsigned NumSlices = Item.size() / NumElts; @@ -1960,7 +2172,8 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty, const SmallPtrSet<Use *, 4> &IdentityLeafs, const SmallPtrSet<Use *, 4> &SplatLeafs, const SmallPtrSet<Use *, 4> &ConcatLeafs, - IRBuilder<> &Builder) { + IRBuilder<> &Builder, + const TargetTransformInfo *TTI) { auto [FrontU, FrontLane] = Item.front(); if (IdentityLeafs.contains(FrontU)) { @@ -1995,13 +2208,14 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty, unsigned NumOps = I->getNumOperands() - (II ? 
1 : 0); SmallVector<Value *> Ops(NumOps); for (unsigned Idx = 0; Idx < NumOps; Idx++) { - if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) { + if (II && + isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) { Ops[Idx] = II->getOperand(Idx); continue; } - Ops[Idx] = - generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), Ty, - IdentityLeafs, SplatLeafs, ConcatLeafs, Builder); + Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), + Ty, IdentityLeafs, SplatLeafs, ConcatLeafs, + Builder, TTI); } SmallVector<Value *, 8> ValueList; @@ -2097,7 +2311,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { all_of(drop_begin(Item), [Item](InstLane &IL) { Value *FrontV = Item.front().first->get(); Use *U = IL.first; - return !U || U->get() == FrontV; + return !U || (isa<Constant>(U->get()) && + cast<Constant>(U->get())->getSplatValue() == + cast<Constant>(FrontV)->getSplatValue()); })) { SplatLeafs.insert(FrontU); continue; @@ -2127,7 +2343,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate()) return false; if (auto *CI = dyn_cast<CastInst>(V)) - if (CI->getSrcTy() != cast<CastInst>(FrontV)->getSrcTy()) + if (CI->getSrcTy()->getScalarType() != + cast<CastInst>(FrontV)->getSrcTy()->getScalarType()) return false; if (auto *SI = dyn_cast<SelectInst>(V)) if (!isa<VectorType>(SI->getOperand(0)->getType()) || @@ -2152,7 +2369,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); continue; - } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontU)) { + } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst, + FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); continue; } else if (auto *BitCast = 
dyn_cast<BitCastInst>(FrontU)) { @@ -2173,7 +2391,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { II && isTriviallyVectorizable(II->getIntrinsicID()) && !II->hasOperandBundles()) { for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { - if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { + if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op, + &TTI)) { if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { Value *FrontV = Item.front().first->get(); Use *U = IL.first; @@ -2189,7 +2408,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { } } - if (isFreeConcat(Item, TTI)) { + if (isFreeConcat(Item, CostKind, TTI)) { ConcatLeafs.insert(FrontU); continue; } @@ -2200,11 +2419,13 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (NumVisited <= 1) return false; + LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n"); + // If we got this far, we know the shuffles are superfluous and can be // removed. Scan through again and generate the new tree of instructions. Builder.SetInsertPoint(&I); Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, - ConcatLeafs, Builder); + ConcatLeafs, Builder, &TTI); replaceValue(I, *V); return true; } @@ -2306,10 +2527,10 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; InstructionCost OldCost = TTI.getShuffleCost( UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, - VecTyForCost, Shuffle->getShuffleMask()); + VecTyForCost, Shuffle->getShuffleMask(), CostKind); InstructionCost NewCost = TTI.getShuffleCost( UsesSecondVec ? 
TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, - VecTyForCost, ConcatMask); + VecTyForCost, ConcatMask, CostKind); LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle << "\n"); @@ -2367,7 +2588,6 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) { auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType()); Type *ResultTy = I.getType(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost OldCost = TTI.getArithmeticReductionCost( ReductionOpc, ReductionSrcTy, std::nullopt, CostKind); OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy, @@ -2624,17 +2844,17 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, - VT, SV->getShuffleMask()); + VT, SV->getShuffleMask(), CostKind); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { - return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); + return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind); }; // Get the costs of the shuffles + binops before and after with the new // shuffle masks. 
InstructionCost CostBefore = - TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + - TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); + TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), @@ -2647,8 +2867,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { FixedVectorType *Op1SmallVT = FixedVectorType::get(VT->getScalarType(), V2.size()); InstructionCost CostAfter = - TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + - TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); + TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) + + TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind); CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), InstructionCost(0), AddShuffleMaskCost); std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); @@ -2717,7 +2937,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { /// lshr((zext(x),y) -> zext(lshr(x,trunc(y))) /// Cost model calculations takes into account if zext(x) has other users and /// whether it can be propagated through them too. 
-bool VectorCombine::shrinkType(llvm::Instruction &I) { +bool VectorCombine::shrinkType(Instruction &I) { Value *ZExted, *OtherOperand; if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))) && @@ -2746,7 +2966,6 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) { // Calculate costs of leaving current IR as it is and moving ZExt operation // later, along with adding truncates if needed - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost ZExtCost = TTI.getCastInstrCost( Instruction::ZExt, BigTy, SmallTy, TargetTransformInfo::CastContextHint::None, CostKind); @@ -2826,26 +3045,46 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { if (ExtIdx >= NumElts || InsIdx >= NumElts) return false; - SmallVector<int> Mask(NumElts, 0); - std::iota(Mask.begin(), Mask.end(), 0); - Mask[InsIdx] = ExtIdx + NumElts; + // Insertion into poison is a cheaper single operand shuffle. + TargetTransformInfo::ShuffleKind SK; + SmallVector<int> Mask(NumElts, PoisonMaskElem); + if (isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec)) { + SK = TargetTransformInfo::SK_PermuteSingleSrc; + Mask[InsIdx] = ExtIdx; + std::swap(DstVec, SrcVec); + } else { + SK = TargetTransformInfo::SK_PermuteTwoSrc; + std::iota(Mask.begin(), Mask.end(), 0); + Mask[InsIdx] = ExtIdx + NumElts; + } + // Cost auto *Ins = cast<InsertElementInst>(&I); auto *Ext = cast<ExtractElementInst>(I.getOperand(1)); - - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost OldCost = - TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) + + InstructionCost InsCost = TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx); + InstructionCost ExtCost = + TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx); + InstructionCost OldCost = ExtCost + InsCost; - InstructionCost NewCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask); + InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, + nullptr, 
{DstVec, SrcVec}); if (!Ext->hasOneUse()) - NewCost += TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx); + NewCost += ExtCost; + + LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); if (OldCost < NewCost) return false; + // Canonicalize undef param to RHS to help further folds. + if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) { + ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); + std::swap(DstVec, SrcVec); + } + Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask); replaceValue(I, *Shuf); @@ -2862,12 +3101,17 @@ bool VectorCombine::run() { if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) return false; + LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n"); + bool MadeChange = false; auto FoldInst = [this, &MadeChange](Instruction &I) { Builder.SetInsertPoint(&I); + bool IsVectorType = isa<VectorType>(I.getType()); bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); auto Opcode = I.getOpcode(); + LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n'); + // These folds should be beneficial regardless of when this pass is run // in the optimization pipeline. // The type checking is for run-time efficiency. 
We can avoid wasting time @@ -2887,7 +3131,7 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms - if (isa<VectorType>(I.getType())) { + if (IsVectorType) { MadeChange |= scalarizeBinopOrCmp(I); MadeChange |= scalarizeLoadExtract(I); MadeChange |= scalarizeVPIntrinsic(I); @@ -2936,6 +3180,9 @@ bool VectorCombine::run() { case Instruction::FCmp: MadeChange |= foldExtractExtract(I); break; + case Instruction::Or: + MadeChange |= foldConcatOfBoolMasks(I); + [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { MadeChange |= foldExtractExtract(I); @@ -2981,7 +3228,8 @@ PreservedAnalyses VectorCombinePass::run(Function &F, DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); AAResults &AA = FAM.getResult<AAManager>(F); const DataLayout *DL = &F.getDataLayout(); - VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly); + VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput, + TryEarlyFoldsOnly); if (!Combiner.run()) return PreservedAnalyses::all(); PreservedAnalyses PA; |
