diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/CodeGen | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/CodeGen')
41 files changed, 1755 insertions, 450 deletions
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 23a3543e9ebe..cd14a4f57f76 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1432,7 +1432,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { MCSection *BBAddrMapSection = getObjFileLowering().getBBAddrMapSection(*MF.getSection()); assert(BBAddrMapSection && ".llvm_bb_addr_map section is not initialized."); - bool HasCalls = !CurrentFnCallsiteSymbols.empty(); + bool HasCalls = !CurrentFnCallsiteEndSymbols.empty(); const MCSymbol *FunctionSymbol = getFunctionBegin(); @@ -1497,13 +1497,13 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol); const MCSymbol *CurrentLabel = MBBSymbol; if (HasCalls) { - auto CallsiteSymbols = CurrentFnCallsiteSymbols.lookup(&MBB); + auto CallsiteEndSymbols = CurrentFnCallsiteEndSymbols.lookup(&MBB); OutStreamer->AddComment("number of callsites"); - OutStreamer->emitULEB128IntValue(CallsiteSymbols.size()); - for (const MCSymbol *CallsiteSymbol : CallsiteSymbols) { + OutStreamer->emitULEB128IntValue(CallsiteEndSymbols.size()); + for (const MCSymbol *CallsiteEndSymbol : CallsiteEndSymbols) { // Emit the callsite offset. - emitLabelDifferenceAsULEB128(CallsiteSymbol, CurrentLabel); - CurrentLabel = CallsiteSymbol; + emitLabelDifferenceAsULEB128(CallsiteEndSymbol, CurrentLabel); + CurrentLabel = CallsiteEndSymbol; } } // Emit the offset to the end of the block, which can be used to compute @@ -1941,8 +1941,6 @@ void AsmPrinter::emitFunctionBody() { !MI.isDebugInstr()) { HasAnyRealCode = true; } - if (MI.isCall() && MF->getTarget().Options.BBAddrMap) - OutStreamer->emitLabel(createCallsiteSymbol(MBB)); // If there is a pre-instruction symbol, emit a label for it here. 
if (MCSymbol *S = MI.getPreInstrSymbol()) @@ -2064,6 +2062,9 @@ void AsmPrinter::emitFunctionBody() { break; } + if (MI.isCall() && MF->getTarget().Options.BBAddrMap) + OutStreamer->emitLabel(createCallsiteEndSymbol(MBB)); + if (TM.Options.EmitCallGraphSection && MI.isCall()) emitIndirectCalleeLabels(FuncInfo, CallSitesInfoMap, MI); @@ -2897,11 +2898,11 @@ MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) { return Res.first->second; } -MCSymbol *AsmPrinter::createCallsiteSymbol(const MachineBasicBlock &MBB) { +MCSymbol *AsmPrinter::createCallsiteEndSymbol(const MachineBasicBlock &MBB) { MCContext &Ctx = MF->getContext(); MCSymbol *Sym = Ctx.createTempSymbol("BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(MBB.getNumber()) + "_CS"); - CurrentFnCallsiteSymbols[&MBB].push_back(Sym); + CurrentFnCallsiteEndSymbols[&MBB].push_back(Sym); return Sym; } @@ -2939,7 +2940,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnBegin = nullptr; CurrentFnBeginLocal = nullptr; CurrentSectionBeginSym = nullptr; - CurrentFnCallsiteSymbols.clear(); + CurrentFnCallsiteEndSymbols.clear(); MBBSectionRanges.clear(); MBBSectionExceptionSyms.clear(); bool NeedsLocalForSize = MAI->needsLocalForSize(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index c27f10077562..2090157a1a91 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3111,8 +3111,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, &AP](const DbgValueLocEntry &Entry, DIExpressionCursor &Cursor) -> bool { if (Entry.isInt()) { - if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || - BT->getEncoding() == dwarf::DW_ATE_signed_char)) + if (BT && (BT->getEncoding() == dwarf::DW_ATE_boolean)) + DwarfExpr.addBooleanConstant(Entry.getInt()); + else if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || + BT->getEncoding() == 
dwarf::DW_ATE_signed_char)) DwarfExpr.addSignedConstant(Entry.getInt()); else DwarfExpr.addUnsignedConstant(Entry.getInt()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index e684054ffa3e..8a30714db2fd 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -194,6 +194,15 @@ void DwarfExpression::addStackValue() { emitOp(dwarf::DW_OP_stack_value); } +void DwarfExpression::addBooleanConstant(int64_t Value) { + assert(isImplicitLocation() || isUnknownLocation()); + LocationKind = Implicit; + if (Value == 0) + emitOp(dwarf::DW_OP_lit0); + else + emitOp(dwarf::DW_OP_lit1); +} + void DwarfExpression::addSignedConstant(int64_t Value) { assert(isImplicitLocation() || isUnknownLocation()); LocationKind = Implicit; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 06809ab26387..700e0ec5813e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -229,6 +229,9 @@ public: /// This needs to be called last to commit any pending changes. void finalize(); + /// Emit a boolean constant. + void addBooleanConstant(int64_t Value); + /// Emit a signed constant. void addSignedConstant(int64_t Value); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index b03fac2d22a5..d76fd0c01020 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1351,6 +1351,13 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) { ContextDIE = &getUnitDie(); // Build the decl now to ensure it precedes the definition. getOrCreateSubprogramDIE(SPDecl); + // Check whether the DIE for SP has already been created after the call + // above. 
+ // FIXME: Should the creation of definition subprogram DIE during + // the creation of declaration subprogram DIE be allowed? + // See https://github.com/llvm/llvm-project/pull/154636. + if (DIE *SPDie = getDIE(SP)) + return SPDie; } } @@ -1403,11 +1410,8 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, // Add the linkage name if we have one and it isn't in the Decl. StringRef LinkageName = SP->getLinkageName(); - assert(((LinkageName.empty() || DeclLinkageName.empty()) || - LinkageName == DeclLinkageName) && - "decl has a linkage name and it is different"); - if (DeclLinkageName.empty() && - // Always emit it for abstract subprograms. + // Always emit linkage name for abstract subprograms. + if (DeclLinkageName != LinkageName && (DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP))) addLinkageName(SPDie, LinkageName); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 278dd6560e73..4931403ab83a 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -84,7 +84,7 @@ private: bool expandAtomicLoadToCmpXchg(LoadInst *LI); StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); bool tryExpandAtomicStore(StoreInst *SI); - void expandAtomicStore(StoreInst *SI); + void expandAtomicStoreToXChg(StoreInst *SI); bool tryExpandAtomicRMW(AtomicRMWInst *AI); AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI); Value * @@ -537,6 +537,9 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) { case TargetLoweringBase::AtomicExpansionKind::NotAtomic: LI->setAtomic(AtomicOrdering::NotAtomic); return true; + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: + TLI->emitExpandAtomicLoad(LI); + return true; default: llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); } @@ -546,8 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) { switch (TLI->shouldExpandAtomicStoreInIR(SI)) { case 
TargetLoweringBase::AtomicExpansionKind::None: return false; + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: + TLI->emitExpandAtomicStore(SI); + return true; case TargetLoweringBase::AtomicExpansionKind::Expand: - expandAtomicStore(SI); + expandAtomicStoreToXChg(SI); return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: SI->setAtomic(AtomicOrdering::NotAtomic); @@ -620,7 +626,7 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) { return NewSI; } -void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) { +void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) { // This function is only called on atomic stores that are too large to be // atomic if implemented as a native store. So we replace them by an // atomic swap, that can be implemented for example as a ldrex/strex on ARM @@ -741,7 +747,7 @@ bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) { } case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicRMWInst(AI); - case TargetLoweringBase::AtomicExpansionKind::Expand: + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: TLI->emitExpandAtomicRMW(AI); return true; default: @@ -1454,7 +1460,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB); + Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) @@ -1473,7 +1480,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); BasicBlock *RetryBB = HasReleasedLoadBB ? 
ReleasedLoadBB : StartBB; Builder.CreateCondBr(StoreSuccess, SuccessBB, - CI->isWeak() ? FailureBB : RetryBB); + CI->isWeak() ? FailureBB : RetryBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); Builder.SetInsertPoint(ReleasedLoadBB); Value *SecondLoad; @@ -1486,7 +1494,9 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); + Builder.CreateCondBr( + ShouldStore, TryStoreBB, NoStoreBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); // Update PHI node in TryStoreBB. LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB); } else @@ -1695,7 +1705,7 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); - case TargetLoweringBase::AtomicExpansionKind::Expand: { + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: { TLI->emitExpandAtomicCmpXchg(CI); return true; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 0e40a92fd8d6..9db4c9e5e280 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2618,22 +2618,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI, bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { BasicBlock *BB = CI->getParent(); - // Lower inline assembly if we can. - // If we found an inline asm expession, and if the target knows how to - // lower it to normal LLVM code, do so now. - if (CI->isInlineAsm()) { - if (TLI->ExpandInlineAsm(CI)) { - // Avoid invalidating the iterator. - CurInstIterator = BB->begin(); - // Avoid processing instructions out of order, which could cause - // reuse before a value is defined. 
- SunkAddrs.clear(); - return true; - } - // Sink address computing for memory operands into the block. - if (optimizeInlineAsmInst(CI)) - return true; - } + // Sink address computing for memory operands into the block. + if (CI->isInlineAsm() && optimizeInlineAsmInst(CI)) + return true; // Align the pointer arguments to this call if the target thinks it's a good // idea diff --git a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp index 442ec3840930..5d7e2b59c204 100644 --- a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp +++ b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp @@ -45,7 +45,7 @@ static cl::opt<bool> EnableNoTrapAfterNoreturn( "after noreturn calls, even if --trap-unreachable is set.")); void CodeGenTargetMachineImpl::initAsmInfo() { - MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str())); + MRI.reset(TheTarget.createMCRegInfo(getTargetTriple())); assert(MRI && "Unable to create reg info"); MII.reset(TheTarget.createMCInstrInfo()); assert(MII && "Unable to create instruction info"); @@ -53,12 +53,12 @@ void CodeGenTargetMachineImpl::initAsmInfo() { // to some backends having subtarget feature dependent module level // code generation. This is similar to the hack in the AsmPrinter for // module level assembly etc. - STI.reset(TheTarget.createMCSubtargetInfo( - getTargetTriple().str(), getTargetCPU(), getTargetFeatureString())); + STI.reset(TheTarget.createMCSubtargetInfo(getTargetTriple(), getTargetCPU(), + getTargetFeatureString())); assert(STI && "Unable to create subtarget info"); - MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo( - *MRI, getTargetTriple().str(), Options.MCOptions); + MCAsmInfo *TmpAsmInfo = + TheTarget.createMCAsmInfo(*MRI, getTargetTriple(), Options.MCOptions); // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0, // and if the old one gets included then MCAsmInfo will be NULL and // we'll crash later. 
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 810dc29d728d..0522698adf18 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -68,7 +68,6 @@ CGOPT(bool, EnableUnsafeFPMath) CGOPT(bool, EnableNoInfsFPMath) CGOPT(bool, EnableNoNaNsFPMath) CGOPT(bool, EnableNoSignedZerosFPMath) -CGOPT(bool, EnableApproxFuncFPMath) CGOPT(bool, EnableNoTrappingFPMath) CGOPT(bool, EnableAIXExtendedAltivecABI) CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath) @@ -245,12 +244,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableNoSignedZerosFPMath); - static cl::opt<bool> EnableApproxFuncFPMath( - "enable-approx-func-fp-math", - cl::desc("Enable FP math optimizations that assume approx func"), - cl::init(false)); - CGBINDOPT(EnableApproxFuncFPMath); - static cl::opt<bool> EnableNoTrappingFPMath( "enable-no-trapping-fp-math", cl::desc("Enable setting the FP exceptions build " @@ -563,7 +556,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.NoInfsFPMath = getEnableNoInfsFPMath(); Options.NoNaNsFPMath = getEnableNoNaNsFPMath(); Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath(); - Options.ApproxFuncFPMath = getEnableApproxFuncFPMath(); Options.NoTrappingFPMath = getEnableNoTrappingFPMath(); DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath(); @@ -718,7 +710,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math"); HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math"); HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math"); - HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math"); if (DenormalFPMathView->getNumOccurrences() > 0 && !F.hasFnAttribute("denormal-fp-math")) { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 
de95e0aaf2cb..7d355e6e365d 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -60,6 +60,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/AllocatorList.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,6 +264,7 @@ public: }; using Addend = std::pair<Value *, bool>; + using AddendList = BumpPtrList<Addend>; using CompositeNode = ComplexDeinterleavingCompositeNode::CompositeNode; // Helper struct for holding info about potential partial multiplication @@ -291,7 +293,7 @@ private: SmallPtrSet<Instruction *, 16> FinalInstructions; /// Root instructions are instructions from which complex computation starts - std::map<Instruction *, CompositeNode *> RootToNode; + DenseMap<Instruction *, CompositeNode *> RootToNode; /// Topologically sorted root instructions SmallVector<Instruction *, 1> OrderedRoots; @@ -339,7 +341,7 @@ private: /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then /// used in the ComplexDeinterleavingOperation::ReductionOperation node /// replacement process. - std::map<PHINode *, PHINode *> OldToNewPHI; + DenseMap<PHINode *, PHINode *> OldToNewPHI; CompositeNode *prepareCompositeNode(ComplexDeinterleavingOperation Operation, Value *R, Value *I) { @@ -417,28 +419,28 @@ private: /// and \p ImagAddens. If \p Accumulator is not null, add the result to it. /// Return nullptr if it is not possible to construct a complex number. /// \p Flags are needed to generate symmetric Add and Sub operations. 
- CompositeNode *identifyAdditions(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends, + CompositeNode *identifyAdditions(AddendList &RealAddends, + AddendList &ImagAddends, std::optional<FastMathFlags> Flags, CompositeNode *Accumulator); /// Extract one addend that have both real and imaginary parts positive. - CompositeNode *extractPositiveAddend(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends); + CompositeNode *extractPositiveAddend(AddendList &RealAddends, + AddendList &ImagAddends); /// Determine if sum of multiplications of complex numbers can be formed from /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result /// to it. Return nullptr if it is not possible to construct a complex number. - CompositeNode *identifyMultiplications(std::vector<Product> &RealMuls, - std::vector<Product> &ImagMuls, + CompositeNode *identifyMultiplications(SmallVectorImpl<Product> &RealMuls, + SmallVectorImpl<Product> &ImagMuls, CompositeNode *Accumulator); /// Go through pairs of multiplication (one Real and one Imag) and find all /// possible candidates for partial multiplication and put them into \p /// Candidates. Returns true if all Product has pair with common operand - bool collectPartialMuls(const std::vector<Product> &RealMuls, - const std::vector<Product> &ImagMuls, - std::vector<PartialMulCandidate> &Candidates); + bool collectPartialMuls(ArrayRef<Product> RealMuls, + ArrayRef<Product> ImagMuls, + SmallVectorImpl<PartialMulCandidate> &Candidates); /// If the code is compiled with -Ofast or expressions have `reassoc` flag, /// the order of complex computation operations may be significantly altered, @@ -1255,8 +1257,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, // Collect multiplications and addend instructions from the given instruction // while traversing it operands. Additionally, verify that all instructions // have the same fast math flags. 
- auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls, - std::list<Addend> &Addends) -> bool { + auto Collect = [&Flags](Instruction *Insn, SmallVectorImpl<Product> &Muls, + AddendList &Addends) -> bool { SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}}; SmallPtrSet<Value *, 8> Visited; while (!Worklist.empty()) { @@ -1336,8 +1338,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, return true; }; - std::vector<Product> RealMuls, ImagMuls; - std::list<Addend> RealAddends, ImagAddends; + SmallVector<Product> RealMuls, ImagMuls; + AddendList RealAddends, ImagAddends; if (!Collect(Real, RealMuls, RealAddends) || !Collect(Imag, ImagMuls, ImagAddends)) return nullptr; @@ -1371,8 +1373,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, } bool ComplexDeinterleavingGraph::collectPartialMuls( - const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls, - std::vector<PartialMulCandidate> &PartialMulCandidates) { + ArrayRef<Product> RealMuls, ArrayRef<Product> ImagMuls, + SmallVectorImpl<PartialMulCandidate> &PartialMulCandidates) { // Helper function to extract a common operand from two products auto FindCommonInstruction = [](const Product &Real, const Product &Imag) -> Value * { @@ -1423,18 +1425,18 @@ bool ComplexDeinterleavingGraph::collectPartialMuls( ComplexDeinterleavingGraph::CompositeNode * ComplexDeinterleavingGraph::identifyMultiplications( - std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls, + SmallVectorImpl<Product> &RealMuls, SmallVectorImpl<Product> &ImagMuls, CompositeNode *Accumulator = nullptr) { if (RealMuls.size() != ImagMuls.size()) return nullptr; - std::vector<PartialMulCandidate> Info; + SmallVector<PartialMulCandidate> Info; if (!collectPartialMuls(RealMuls, ImagMuls, Info)) return nullptr; // Map to store common instruction to node pointers - std::map<Value *, CompositeNode *> CommonToNode; - std::vector<bool> Processed(Info.size(), 
false); + DenseMap<Value *, CompositeNode *> CommonToNode; + SmallVector<bool> Processed(Info.size(), false); for (unsigned I = 0; I < Info.size(); ++I) { if (Processed[I]) continue; @@ -1463,8 +1465,8 @@ ComplexDeinterleavingGraph::identifyMultiplications( } } - std::vector<bool> ProcessedReal(RealMuls.size(), false); - std::vector<bool> ProcessedImag(ImagMuls.size(), false); + SmallVector<bool> ProcessedReal(RealMuls.size(), false); + SmallVector<bool> ProcessedImag(ImagMuls.size(), false); CompositeNode *Result = Accumulator; for (auto &PMI : Info) { if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx]) @@ -1580,7 +1582,7 @@ ComplexDeinterleavingGraph::identifyMultiplications( ComplexDeinterleavingGraph::CompositeNode * ComplexDeinterleavingGraph::identifyAdditions( - std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends, + AddendList &RealAddends, AddendList &ImagAddends, std::optional<FastMathFlags> Flags, CompositeNode *Accumulator = nullptr) { if (RealAddends.size() != ImagAddends.size()) return nullptr; @@ -1671,8 +1673,8 @@ ComplexDeinterleavingGraph::identifyAdditions( } ComplexDeinterleavingGraph::CompositeNode * -ComplexDeinterleavingGraph::extractPositiveAddend( - std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) { +ComplexDeinterleavingGraph::extractPositiveAddend(AddendList &RealAddends, + AddendList &ImagAddends) { for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { auto [R, IsPositiveR] = *ItR; diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 1c1047c1ce18..9cc6c6a706c5 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -16,18 +16,29 @@ #include "llvm/CodeGen/ExpandFp.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/SimplifyQuery.h" +#include 
"llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <optional> + +#define DEBUG_TYPE "expand-fp" using namespace llvm; @@ -37,6 +48,359 @@ static cl::opt<unsigned> cl::desc("fp convert instructions on integers with " "more than <N> bits are expanded.")); +namespace { +/// This class implements a precise expansion of the frem instruction. +/// The generated code is based on the fmod implementation in the AMD device +/// libs. +class FRemExpander { + /// The IRBuilder to use for the expansion. + IRBuilder<> &B; + + /// Floating point type of the return value and the arguments of the FRem + /// instructions that should be expanded. + Type *FremTy; + + /// Floating point type to use for the computation. This may be + /// wider than the \p FremTy. + Type *ComputeFpTy; + + /// Integer type used to hold the exponents returned by frexp. + Type *ExTy; + + /// How many bits of the quotient to compute per iteration of the + /// algorithm, stored as a value of type \p ExTy. + Value *Bits; + + /// Constant 1 of type \p ExTy. + Value *One; + +public: + static bool canExpandType(Type *Ty) { + // TODO The expansion should work for other floating point types + // as well, but this would require additional testing. 
+ return Ty->isIEEELikeFPTy() && !Ty->isBFloatTy() && !Ty->isFP128Ty(); + } + + static FRemExpander create(IRBuilder<> &B, Type *Ty) { + assert(canExpandType(Ty)); + + // The type to use for the computation of the remainder. This may be + // wider than the input/result type which affects the ... + Type *ComputeTy = Ty; + // ... maximum number of iterations of the remainder computation loop + // to use. This value is for the case in which the computation + // uses the same input/result type. + unsigned MaxIter = 2; + + if (Ty->isHalfTy()) { + // Use the wider type and fewer iterations. + ComputeTy = B.getFloatTy(); + MaxIter = 1; + } + + unsigned Precision = + llvm::APFloat::semanticsPrecision(Ty->getFltSemantics()); + return FRemExpander{B, Ty, Precision / MaxIter, ComputeTy}; + } + + /// Build the FRem expansion for the numerator \p X and the + /// denominator \p Y. The type of X and Y must match \p FremTy. The + /// code will be generated at the insertion point of \p B and the + /// insertion point will be reset at exit. + Value *buildFRem(Value *X, Value *Y, std::optional<SimplifyQuery> &SQ) const; + + /// Build an approximate FRem expansion for the numerator \p X and + /// the denominator \p Y at the insertion point of builder \p B. + /// The type of X and Y must match \p FremTy. + Value *buildApproxFRem(Value *X, Value *Y) const; + +private: + FRemExpander(IRBuilder<> &B, Type *FremTy, unsigned Bits, Type *ComputeFpTy) + : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), ExTy(B.getInt32Ty()), + Bits(ConstantInt::get(ExTy, Bits)), One(ConstantInt::get(ExTy, 1)) {}; + + Value *createRcp(Value *V, const Twine &Name) const { + // Leave it to later optimizations to turn this into an rcp + // instruction if available. + return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name); + } + + // Helper function to build the UPDATE_AX code which is common to the + // loop body and the "final iteration".
+ Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const { + // Build: + // float q = rint(ax * ayinv); + // ax = fma(-q, ay, ax); + // int clt = ax < 0.0f; + // float axp = ax + ay; + // ax = clt ? axp : ax; + Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv), + {}, "q"); + Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax"); + Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate, + ConstantFP::getZero(ComputeFpTy), "clt"); + Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp"); + return B.CreateSelect(Clt, Axp, AxUpdate, "ax"); + } + + /// Build code to extract the exponent and mantissa of \p Src. + /// Return the exponent minus one for use as a loop bound and + /// the mantissa taken to the given \p NewExp power. + std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp, + const Twine &ExName, + const Twine &PowName) const { + // Build: + // ExName = frexp_exp(Src) - 1; + // PowName = fldexp(frexp_mant(Src), NewExp); + Type *Ty = Src->getType(); + Type *ExTy = B.getInt32Ty(); + Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src); + Value *Mant = B.CreateExtractValue(Frexp, {0}); + Value *Exp = B.CreateExtractValue(Frexp, {1}); + + Exp = B.CreateSub(Exp, One, ExName); + Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName); + + return {Pow, Exp}; + } + + /// Build the main computation of the remainder for the case in which + /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the + /// denominator. Add the incoming edge from the computation result + /// to \p RetPhi.
+ void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X, + PHINode *RetPhi, FastMathFlags FMF) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(FMF); + + // Build: + // ex = frexp_exp(ax) - 1; + // ax = fldexp(frexp_mant(ax), bits); + // ey = frexp_exp(ay) - 1; + // ay = fldexp(frexp_mant(ay), 1); + auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax"); + auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay"); + + // Build: + // int nb = ex - ey; + // float ayinv = 1.0/ay; + Value *Nb = B.CreateSub(Ex, Ey, "nb"); + Value *Ayinv = createRcp(Ay, "ayinv"); + + // Build: while (nb > bits) + BasicBlock *PreheaderBB = B.GetInsertBlock(); + Function *Fun = PreheaderBB->getParent(); + auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun); + auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun); + + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB); + + // Build loop body: + // UPDATE_AX + // ax = fldexp(ax, bits); + // nb -= bits; + // One iteration of the loop is factored out. The code shared by + // the loop and this "iteration" is denoted by UPDATE_AX.
+ B.SetInsertPoint(LoopBB); + PHINode *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv"); + NbIv->addIncoming(Nb, PreheaderBB); + + auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi"); + AxPhi->addIncoming(Ax, PreheaderBB); + + Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv); + AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update"); + AxPhi->addIncoming(AxPhiUpdate, LoopBB); + NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB); + + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB); + + // Build final iteration + // ax = fldexp(ax, nb - bits + 1); + // UPDATE_AX + B.SetInsertPoint(ExitBB); + + auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi"); + AxPhiExit->addIncoming(Ax, PreheaderBB); + AxPhiExit->addIncoming(AxPhi, LoopBB); + auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi"); + NbExitPhi->addIncoming(NbIv, LoopBB); + NbExitPhi->addIncoming(Nb, PreheaderBB); + + Value *AxFinal = B.CreateLdexp( + AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax"); + AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv); + + // Build: + // ax = fldexp(ax, ey); + // ret = copysign(ax,x); + AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax"); + if (ComputeFpTy != FremTy) + AxFinal = B.CreateFPTrunc(AxFinal, FremTy); + Value *Ret = B.CreateCopySign(AxFinal, X); + + RetPhi->addIncoming(Ret, ExitBB); + } + + /// Build the else-branch of the conditional in the FRem + /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay + /// = |Y|, and X is the numerator and Y the denominator. Add the + /// incoming edge from the result to \p RetPhi. + void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const { + // Build: + // ret = ax == ay ? 
copysign(0.0f, x) : x; + Value *ZeroWithXSign = B.CreateCopySign(ConstantFP::getZero(FremTy), X); + Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X); + + RetPhi->addIncoming(Ret, B.GetInsertBlock()); + } + + /// Return a value that is NaN if one of the corner cases concerning + /// the inputs \p X and \p Y is detected, and \p Ret otherwise. + Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y, + std::optional<SimplifyQuery> &SQ, + bool NoInfs) const { + // Build: + // ret = (y == 0.0f || isnan(y)) ? QNAN : ret; + // ret = isfinite(x) ? ret : QNAN; + Value *Nan = ConstantFP::getQNaN(FremTy); + Ret = B.CreateSelect(B.CreateFCmpUEQ(Y, ConstantFP::getZero(FremTy)), Nan, + Ret); + Value *XFinite = + NoInfs || (SQ && isKnownNeverInfinity(X, *SQ)) + ? B.getTrue() + : B.CreateFCmpULT(B.CreateUnaryIntrinsic(Intrinsic::fabs, X), + ConstantFP::getInfinity(FremTy)); + Ret = B.CreateSelect(XFinite, Ret, Nan); + + return Ret; + } +}; + +Value *FRemExpander::buildApproxFRem(Value *X, Value *Y) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + // Propagating the approximate functions flag to the + // division leads to an unacceptable drop in precision + // on AMDGPU. + // TODO Find out if any flags might be worth propagating. 
+ B.clearFastMathFlags(); + + Value *Quot = B.CreateFDiv(X, Y); + Value *Trunc = B.CreateUnaryIntrinsic(Intrinsic::trunc, Quot, {}); + Value *Neg = B.CreateFNeg(Trunc); + + return B.CreateFMA(Neg, Y, X); +} + +Value *FRemExpander::buildFRem(Value *X, Value *Y, + std::optional<SimplifyQuery> &SQ) const { + assert(X->getType() == FremTy && Y->getType() == FremTy); + + FastMathFlags FMF = B.getFastMathFlags(); + + // This function generates the following code structure: + // if (abs(x) > abs(y)) + // { ret = compute remainder } + // else + // { ret = x or 0 with sign of x } + // Adjust ret to NaN/inf in input + // return ret + Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax"); + Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay"); + if (ComputeFpTy != X->getType()) { + Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax"); + Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay"); + } + Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay); + + PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret"); + Value *Ret = RetPhi; + + // We would return NaN in all corner cases handled here. + // Hence, if NaNs are excluded, keep the result as it is. + if (!FMF.noNaNs()) + Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs()); + + Function *Fun = B.GetInsertBlock()->getParent(); + auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun); + auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun); + SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB); + + auto SavedInsertPt = B.GetInsertPoint(); + + // Build remainder computation for "then" branch + // + // The ordered comparison ensures that ax and ay are not NaNs + // in the then-branch. Furthermore, y cannot be an infinity and the + // check at the end of the function ensures that the result will not + // be used if x is an infinity. 
+ FastMathFlags ComputeFMF = FMF; + ComputeFMF.setNoInfs(); + ComputeFMF.setNoNaNs(); + + B.SetInsertPoint(ThenBB); + buildRemainderComputation(Ax, Ay, X, RetPhi, FMF); + B.CreateBr(RetPhi->getParent()); + + // Build "else"-branch + B.SetInsertPoint(ElseBB); + buildElseBranch(Ax, Ay, X, RetPhi); + B.CreateBr(RetPhi->getParent()); + + B.SetInsertPoint(SavedInsertPt); + + return Ret; +} +} // namespace + +static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) { + LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n'); + + Type *ReturnTy = I.getType(); + assert(FRemExpander::canExpandType(ReturnTy->getScalarType())); + + FastMathFlags FMF = I.getFastMathFlags(); + // TODO Make use of those flags for optimization? + FMF.setAllowReciprocal(false); + FMF.setAllowContract(false); + + IRBuilder<> B(&I); + B.setFastMathFlags(FMF); + B.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *ElemTy = ReturnTy->getScalarType(); + const FRemExpander Expander = FRemExpander::create(B, ElemTy); + + Value *Ret; + if (ReturnTy->isFloatingPointTy()) + Ret = FMF.approxFunc() + ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1)) + : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ); + else { + auto *VecTy = cast<FixedVectorType>(ReturnTy); + + // This could use SplitBlockAndInsertForEachLane but the interface + // is a bit awkward for a constant number of elements and it will + // boil down to the same code. + // TODO Expand the FRem instruction only once and reuse the code. + Value *Nums = I.getOperand(0); + Value *Denums = I.getOperand(1); + Ret = PoisonValue::get(I.getType()); + for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) { + Value *Num = B.CreateExtractElement(Nums, I); + Value *Denum = B.CreateExtractElement(Denums, I); + Value *Rem = FMF.approxFunc() ? 
Expander.buildApproxFRem(Num, Denum) + : Expander.buildFRem(Num, Denum, SQ); + Ret = B.CreateInsertElement(Ret, Rem, I); + } + } + + I.replaceAllUsesWith(Ret); + Ret->takeName(&I); + I.eraseFromParent(); + + return true; +} // clang-format off: preserve formatting of the following example /// Generate code to convert a fp number to integer, replacing FPToS(U)I with @@ -64,8 +428,8 @@ static cl::opt<unsigned> /// br i1 %cmp6.not, label %if.end12, label %if.then8 /// /// if.then8: ; preds = %if.end -/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808 -/// br label %cleanup +/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 +/// -9223372036854775808 br label %cleanup /// /// if.end12: ; preds = %if.end /// %cmp13 = icmp ult i64 %shr, 150 @@ -83,9 +447,10 @@ static cl::opt<unsigned> /// %mul19 = mul nsw i64 %shl, %conv /// br label %cleanup /// -/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8 -/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ] -/// ret i64 %retval.0 +/// cleanup: ; preds = %entry, +/// %if.else, %if.then15, %if.then8 +/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ +/// %mul19, %if.else ], [ 0, %entry ] ret i64 %retval.0 /// } /// /// Replace fp to integer with generated code. 
@@ -272,13 +637,11 @@ static void expandFPToI(Instruction *FPToI) { /// %or = or i64 %shr6, %conv11 /// br label %sw.epilog /// -/// sw.epilog: ; preds = %sw.default, %if.then4, %sw.bb -/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ] -/// %1 = lshr i64 %a.addr.0, 2 -/// %2 = and i64 %1, 1 -/// %or16 = or i64 %2, %a.addr.0 -/// %inc = add nsw i64 %or16, 1 -/// %3 = and i64 %inc, 67108864 +/// sw.epilog: ; preds = %sw.default, +/// %if.then4, %sw.bb +/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, +/// %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2, +/// %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864 /// %tobool.not = icmp eq i64 %3, 0 /// %spec.select.v = select i1 %tobool.not, i64 2, i64 3 /// %spec.select = ashr i64 %inc, %spec.select.v @@ -291,7 +654,8 @@ static void expandFPToI(Instruction *FPToI) { /// %shl25 = shl i64 %sub, %sh_prom24 /// br label %if.end26 /// -/// if.end26: ; preds = %sw.epilog, %if.else +/// if.end26: ; preds = %sw.epilog, +/// %if.else /// %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ] /// %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ] /// %conv27 = trunc i64 %shr to i32 @@ -305,7 +669,8 @@ static void expandFPToI(Instruction *FPToI) { /// %4 = bitcast i32 %or33 to float /// br label %return /// -/// return: ; preds = %entry, %if.end26 +/// return: ; preds = %entry, +/// %if.end26 /// %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ] /// ret float %retval.0 /// } @@ -594,7 +959,38 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) { I->eraseFromParent(); } -static bool runImpl(Function &F, const TargetLowering &TLI) { +// This covers all floating point types; more than we need here. +// TODO Move somewhere else for general use? +/// Return the Libcall for a frem instruction of +/// type \p Ty. 
+static RTLIB::Libcall fremToLibcall(Type *Ty) { + assert(Ty->isFloatingPointTy()); + if (Ty->isFloatTy() || Ty->is16bitFPTy()) + return RTLIB::REM_F32; + if (Ty->isDoubleTy()) + return RTLIB::REM_F64; + if (Ty->isFP128Ty()) + return RTLIB::REM_F128; + if (Ty->isX86_FP80Ty()) + return RTLIB::REM_F80; + if (Ty->isPPC_FP128Ty()) + return RTLIB::REM_PPCF128; + + llvm_unreachable("Unknown floating point type"); +} + +/* Return true if, according to \p TLI, the target either directly + supports the frem instruction for type \p Ty, has a custom lowering, + or uses a libcall. */ +static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) { + if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty))) + return true; + + return TLI.getLibcallName(fremToLibcall(Ty->getScalarType())); +} + +static bool runImpl(Function &F, const TargetLowering &TLI, + AssumptionCache *AC) { SmallVector<Instruction *, 4> Replace; SmallVector<Instruction *, 4> ReplaceVector; bool Modified = false; @@ -609,6 +1005,21 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { for (auto &I : instructions(F)) { switch (I.getOpcode()) { + case Instruction::FRem: { + Type *Ty = I.getType(); + // TODO: This pass doesn't handle scalable vectors. + if (Ty->isScalableTy()) + continue; + + if (targetSupportsFrem(TLI, Ty) || + !FRemExpander::canExpandType(Ty->getScalarType())) + continue; + + Replace.push_back(&I); + Modified = true; + + break; + } case Instruction::FPToUI: case Instruction::FPToSI: { // TODO: This pass doesn't handle scalable vectors. 
@@ -659,8 +1070,20 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { while (!Replace.empty()) { Instruction *I = Replace.pop_back_val(); - if (I->getOpcode() == Instruction::FPToUI || - I->getOpcode() == Instruction::FPToSI) { + if (I->getOpcode() == Instruction::FRem) { + auto SQ = [&]() -> std::optional<SimplifyQuery> { + if (AC) { + auto Res = std::make_optional<SimplifyQuery>( + I->getModule()->getDataLayout(), I); + Res->AC = AC; + return Res; + } + return {}; + }(); + + expandFRem(cast<BinaryOperator>(*I), SQ); + } else if (I->getOpcode() == Instruction::FPToUI || + I->getOpcode() == Instruction::FPToSI) { expandFPToI(I); } else { expandIToFP(I); @@ -672,31 +1095,58 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { namespace { class ExpandFpLegacyPass : public FunctionPass { + CodeGenOptLevel OptLevel; + public: static char ID; - ExpandFpLegacyPass() : FunctionPass(ID) { + ExpandFpLegacyPass(CodeGenOptLevel OptLevel) + : FunctionPass(ID), OptLevel(OptLevel) { initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry()); } + ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {}; + bool runOnFunction(Function &F) override { auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); - return runImpl(F, *TLI); + AssumptionCache *AC = nullptr; + + if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone()) + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + return runImpl(F, *TLI, AC); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetPassConfig>(); + if (OptLevel != CodeGenOptLevel::None) + AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } }; } // namespace +ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel) + : TM(TM), OptLevel(OptLevel) {} + +void ExpandFpPass::printPipeline( + raw_ostream &OS, 
function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<ExpandFpPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << '<'; + OS << "O" << (int)OptLevel; + OS << '>'; +} + PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) { const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F); - return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + auto &TLI = *STI->getTargetLowering(); + AssumptionCache *AC = nullptr; + if (OptLevel != CodeGenOptLevel::None) + AC = &FAM.getResult<AssumptionAnalysis>(F); + return runImpl(F, TLI, AC) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } char ExpandFpLegacyPass::ID = 0; @@ -704,4 +1154,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp", "Expand certain fp instructions", false, false) INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false) -FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); } +FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) { + return new ExpandFpLegacyPass(OptLevel); +} diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 753c65600770..03abc042e556 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -150,9 +150,8 @@ struct CachingVPExpander { ElementCount ElemCount); /// If needed, folds the EVL in the mask operand and discards the EVL - /// parameter. Returns a pair of the value of the intrinsic after the change - /// (if any) and whether the mask was actually folded. - std::pair<Value *, bool> foldEVLIntoMask(VPIntrinsic &VPI); + /// parameter. Returns true if the mask was actually folded. + bool foldEVLIntoMask(VPIntrinsic &VPI); /// "Remove" the %evl parameter of \p PI by setting it to the static vector /// length of the operation. 
Returns true if the %evl (if any) was effectively @@ -160,34 +159,31 @@ struct CachingVPExpander { bool discardEVLParameter(VPIntrinsic &PI); /// Lower this VP binary operator to a unpredicated binary operator. - Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, - VPIntrinsic &PI); + bool expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); /// Lower this VP int call to a unpredicated int call. - Value *expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI); + bool expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI); /// Lower this VP fp call to a unpredicated fp call. - Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, - unsigned UnpredicatedIntrinsicID); + bool expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, + unsigned UnpredicatedIntrinsicID); /// Lower this VP reduction to a call to an unpredicated reduction intrinsic. - Value *expandPredicationInReduction(IRBuilder<> &Builder, - VPReductionIntrinsic &PI); + bool expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &PI); /// Lower this VP cast operation to a non-VP intrinsic. - Value *expandPredicationToCastIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI); + bool expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPIntrinsic &VPI); /// Lower this VP memory operation to a non-VP intrinsic. - Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI); + bool expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, + VPIntrinsic &VPI); /// Lower this VP comparison to a call to an unpredicated comparison. - Value *expandPredicationInComparison(IRBuilder<> &Builder, - VPCmpIntrinsic &PI); + bool expandPredicationInComparison(IRBuilder<> &Builder, VPCmpIntrinsic &PI); /// Query TTI and expand the vector predication in \p P accordingly. 
- Value *expandPredication(VPIntrinsic &PI); + bool expandPredication(VPIntrinsic &PI); /// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This /// overrides TTI with the cl::opts listed at the top of this file. @@ -227,9 +223,8 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat); } -Value * -CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, + VPIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -261,14 +256,14 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName()); replaceOperation(*NewBinOp, VPI); - return NewBinOp; + return true; } -Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, + VPIntrinsic &VPI) { std::optional<unsigned> FID = VPI.getFunctionalIntrinsicID(); if (!FID) - return nullptr; + return false; SmallVector<Value *, 2> Argument; for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) { Argument.push_back(VPI.getOperand(i)); @@ -276,10 +271,10 @@ Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, Value *NewOp = Builder.CreateIntrinsic(FID.value(), {VPI.getType()}, Argument, /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } -Value *CachingVPExpander::expandPredicationToFPCall( +bool CachingVPExpander::expandPredicationToFPCall( IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -297,7 +292,7 @@ 
Value *CachingVPExpander::expandPredicationToFPCall( UnpredicatedIntrinsicID, {VPI.getType()}, Argument, /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } case Intrinsic::fma: case Intrinsic::fmuladd: @@ -315,11 +310,11 @@ Value *CachingVPExpander::expandPredicationToFPCall( else NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } } - return nullptr; + return false; } static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, @@ -331,9 +326,8 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, return getReductionIdentity(RdxID, EltTy, FMF); } -Value * -CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, - VPReductionIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInReduction( + IRBuilder<> &Builder, VPReductionIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -391,11 +385,11 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, } replaceOperation(*Reduction, VPI); - return Reduction; + return true; } -Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, + VPIntrinsic &VPI) { Intrinsic::ID VPID = VPI.getIntrinsicID(); unsigned CastOpcode = VPIntrinsic::getFunctionalOpcodeForVP(VPID).value(); assert(Instruction::isCast(CastOpcode)); @@ -404,12 +398,11 @@ Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPI.getType(), VPI.getName()); replaceOperation(*CastOp, VPI); - return CastOp; + return true; } -Value * -CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, + 
VPIntrinsic &VPI) { assert(VPI.canIgnoreVectorLengthParam()); const auto &DL = VPI.getDataLayout(); @@ -469,11 +462,11 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, assert(NewMemoryInst); replaceOperation(*NewMemoryInst, VPI); - return NewMemoryInst; + return true; } -Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, - VPCmpIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, + VPCmpIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -487,7 +480,7 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1); replaceOperation(*NewCmp, VPI); - return NewCmp; + return true; } bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { @@ -516,17 +509,24 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { return true; } -std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { +bool CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n'); IRBuilder<> Builder(&VPI); // Ineffective %evl parameter and so nothing to do here. if (VPI.canIgnoreVectorLengthParam()) - return {&VPI, false}; + return false; // Only VP intrinsics can have an %evl parameter. 
Value *OldMaskParam = VPI.getMaskParam(); + if (!OldMaskParam) { + assert((VPI.getIntrinsicID() == Intrinsic::vp_merge || + VPI.getIntrinsicID() == Intrinsic::vp_select) && + "Unexpected VP intrinsic without mask operand"); + OldMaskParam = VPI.getArgOperand(0); + } + Value *OldEVLParam = VPI.getVectorLengthParam(); assert(OldMaskParam && "no mask param to fold the vl param into"); assert(OldEVLParam && "no EVL param to fold away"); @@ -538,7 +538,11 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { ElementCount ElemCount = VPI.getStaticVectorLength(); Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount); Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam); - VPI.setMaskParam(NewMaskParam); + if (VPI.getIntrinsicID() == Intrinsic::vp_merge || + VPI.getIntrinsicID() == Intrinsic::vp_select) + VPI.setArgOperand(0, NewMaskParam); + else + VPI.setMaskParam(NewMaskParam); // Drop the %evl parameter. discardEVLParameter(VPI); @@ -546,10 +550,10 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { "transformation did not render the evl param ineffective!"); // Reassess the modified instruction. 
- return {&VPI, true}; + return true; } -Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredication(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n'); IRBuilder<> Builder(&VPI); @@ -566,9 +570,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI)) return expandPredicationInComparison(Builder, *VPCmp); - if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) { + if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) return expandPredicationToCastIntrinsic(Builder, VPI); - } switch (VPI.getIntrinsicID()) { default: @@ -578,6 +581,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { replaceOperation(*NewNegOp, VPI); return NewNegOp; } + case Intrinsic::vp_select: + case Intrinsic::vp_merge: { + assert(maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()); + Value *NewSelectOp = Builder.CreateSelect( + VPI.getOperand(0), VPI.getOperand(1), VPI.getOperand(2), VPI.getName()); + replaceOperation(*NewSelectOp, VPI); + return NewSelectOp; + } case Intrinsic::vp_abs: case Intrinsic::vp_smax: case Intrinsic::vp_smin: @@ -613,10 +624,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { } if (auto CID = VPI.getConstrainedIntrinsicID()) - if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID)) - return Call; + if (expandPredicationToFPCall(Builder, VPI, *CID)) + return true; - return &VPI; + return false; } //// } CachingVPExpander @@ -673,8 +684,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { Changed = VPExpansionDetails::IntrinsicUpdated; break; case VPLegalization::Convert: - if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) { - (void)NewVPI; + if (foldEVLIntoMask(VPI)) { Changed = VPExpansionDetails::IntrinsicUpdated; ++NumFoldedVL; } @@ -688,7 +698,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { case VPLegalization::Discard: 
llvm_unreachable("Invalid strategy for operators."); case VPLegalization::Convert: - if (Value *V = expandPredication(VPI); V != &VPI) { + if (expandPredication(VPI)) { ++NumLoweredVPOps; Changed = VPExpansionDetails::IntrinsicReplaced; } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 90a18b86c1b1..b3c312569736 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1256,7 +1256,7 @@ LLT CallLowering::ValueHandler::getStackValueStoreType( if (Flags.isPointer()) { LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(), ValTy.getScalarSizeInBits()); - if (ValVT.isVector()) + if (ValVT.isVector() && ValVT.getVectorNumElements() != 1) return LLT::vector(ValTy.getElementCount(), PtrTy); return PtrTy; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 0674f5fd1ae0..0ebee2cfd868 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2094,6 +2094,68 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, return true; } +bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, + LshrOfTruncOfLshr &MatchInfo, + MachineInstr &ShiftMI) const { + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); + + Register N0 = MI.getOperand(1).getReg(); + Register N1 = MI.getOperand(2).getReg(); + unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits(); + + APInt N1C, N001C; + if (!mi_match(N1, MRI, m_ICstOrSplat(N1C))) + return false; + auto N001 = ShiftMI.getOperand(2).getReg(); + if (!mi_match(N001, MRI, m_ICstOrSplat(N001C))) + return false; + + if (N001C.getBitWidth() > N1C.getBitWidth()) + N1C = N1C.zext(N001C.getBitWidth()); + else + N001C = N001C.zext(N1C.getBitWidth()); + + Register InnerShift = ShiftMI.getOperand(0).getReg(); + LLT InnerShiftTy = MRI.getType(InnerShift); + uint64_t InnerShiftSize = 
InnerShiftTy.getScalarSizeInBits(); + if ((N1C + N001C).ult(InnerShiftSize)) { + MatchInfo.Src = ShiftMI.getOperand(1).getReg(); + MatchInfo.ShiftAmt = N1C + N001C; + MatchInfo.ShiftAmtTy = MRI.getType(N001); + MatchInfo.InnerShiftTy = InnerShiftTy; + + if ((N001C + OpSizeInBits) == InnerShiftSize) + return true; + if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) { + MatchInfo.Mask = true; + MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C; + return true; + } + } + return false; +} + +void CombinerHelper::applyLshrOfTruncOfLshr( + MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); + + Register Dst = MI.getOperand(0).getReg(); + auto ShiftAmt = + Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt); + auto Shift = + Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt); + if (MatchInfo.Mask == true) { + APInt MaskVal = + APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), + MatchInfo.MaskVal.getZExtValue()); + auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal); + auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask); + Builder.buildTrunc(Dst, And); + } else + Builder.buildTrunc(Dst, Shift); + MI.eraseFromParent(); +} + bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 008c18837a52..b02465d99a60 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2916,6 +2916,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_SREM: case TargetOpcode::G_SMIN: case TargetOpcode::G_SMAX: + case TargetOpcode::G_ABDS: Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, 
TargetOpcode::G_SEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); @@ -2953,6 +2954,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: + case TargetOpcode::G_ABDU: Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); @@ -4742,6 +4744,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerShlSat(MI); case G_ABS: return lowerAbsToAddXor(MI); + case G_ABDS: + case G_ABDU: { + bool IsSigned = MI.getOpcode() == G_ABDS; + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if ((IsSigned && LI.isLegal({G_SMIN, Ty}) && LI.isLegal({G_SMAX, Ty})) || + (!IsSigned && LI.isLegal({G_UMIN, Ty}) && LI.isLegal({G_UMAX, Ty}))) { + return lowerAbsDiffToMinMax(MI); + } + return lowerAbsDiffToSelect(MI); + } case G_FABS: return lowerFAbs(MI); case G_SELECT: @@ -4773,6 +4785,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerVectorReduction(MI); case G_VAARG: return lowerVAArg(MI); + case G_ATOMICRMW_SUB: { + auto [Ret, Mem, Val] = MI.getFirst3Regs(); + const LLT ValTy = MRI.getType(Val); + MachineMemOperand *MMO = *MI.memoperands_begin(); + + auto VNeg = MIRBuilder.buildNeg(ValTy, Val); + MIRBuilder.buildAtomicRMW(G_ATOMICRMW_ADD, Ret, Mem, VNeg, *MMO); + MI.eraseFromParent(); + return Legalized; + } } } @@ -5222,19 +5244,13 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, InsertVal = MI.getOperand(2).getReg(); Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); - - // TODO: Handle total scalarization case. - if (!NarrowVecTy.isVector()) - return UnableToLegalize; - LLT VecTy = MRI.getType(SrcVec); // If the index is a constant, we can really break this down as you would // expect, and index into the target size pieces. 
- int64_t IdxVal; auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI); if (MaybeCst) { - IdxVal = MaybeCst->Value.getSExtValue(); + uint64_t IdxVal = MaybeCst->Value.getZExtValue(); // Avoid out of bounds indexing the pieces. if (IdxVal >= VecTy.getNumElements()) { MIRBuilder.buildUndef(DstReg); @@ -5242,33 +5258,45 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, return Legalized; } - SmallVector<Register, 8> VecParts; - LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); + if (!NarrowVecTy.isVector()) { + SmallVector<Register, 8> SplitPieces; + extractParts(MI.getOperand(1).getReg(), NarrowVecTy, + VecTy.getNumElements(), SplitPieces, MIRBuilder, MRI); + if (IsInsert) { + SplitPieces[IdxVal] = InsertVal; + MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), SplitPieces); + } else { + MIRBuilder.buildCopy(MI.getOperand(0).getReg(), SplitPieces[IdxVal]); + } + } else { + SmallVector<Register, 8> VecParts; + LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); - // Build a sequence of NarrowTy pieces in VecParts for this operand. - LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, - TargetOpcode::G_ANYEXT); + // Build a sequence of NarrowTy pieces in VecParts for this operand. + LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, + TargetOpcode::G_ANYEXT); - unsigned NewNumElts = NarrowVecTy.getNumElements(); + unsigned NewNumElts = NarrowVecTy.getNumElements(); - LLT IdxTy = MRI.getType(Idx); - int64_t PartIdx = IdxVal / NewNumElts; - auto NewIdx = - MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); + LLT IdxTy = MRI.getType(Idx); + int64_t PartIdx = IdxVal / NewNumElts; + auto NewIdx = + MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); - if (IsInsert) { - LLT PartTy = MRI.getType(VecParts[PartIdx]); + if (IsInsert) { + LLT PartTy = MRI.getType(VecParts[PartIdx]); - // Use the adjusted index to insert into one of the subvectors. 
- auto InsertPart = MIRBuilder.buildInsertVectorElement( - PartTy, VecParts[PartIdx], InsertVal, NewIdx); - VecParts[PartIdx] = InsertPart.getReg(0); + // Use the adjusted index to insert into one of the subvectors. + auto InsertPart = MIRBuilder.buildInsertVectorElement( + PartTy, VecParts[PartIdx], InsertVal, NewIdx); + VecParts[PartIdx] = InsertPart.getReg(0); - // Recombine the inserted subvector with the others to reform the result - // vector. - buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); - } else { - MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + // Recombine the inserted subvector with the others to reform the result + // vector. + buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); + } else { + MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + } } MI.eraseFromParent(); @@ -5970,7 +5998,6 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt, return Legalized; } -// TODO: Optimize if constant shift amount. LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, LLT RequestedTy) { @@ -5992,6 +6019,27 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, if (DstEltSize % 2 != 0) return UnableToLegalize; + // Check if we should use multi-way splitting instead of recursive binary + // splitting. + // + // Multi-way splitting directly decomposes wide shifts (e.g., 128-bit -> + // 4×32-bit) in a single legalization step, avoiding the recursive overhead + // and dependency chains created by usual binary splitting approach + // (128->64->32). + // + // The >= 8 parts threshold ensures we only use this optimization when binary + // splitting would require multiple recursive passes, avoiding overhead for + // simple 2-way splits where binary approach is sufficient. 
+ if (RequestedTy.isValid() && RequestedTy.isScalar() && + DstEltSize % RequestedTy.getSizeInBits() == 0) { + const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits(); + // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive + // steps). + if (NumParts >= 8) + return narrowScalarShiftMultiway(MI, RequestedTy); + } + + // Fall back to binary splitting: // Ignore the input type. We can only go to exactly half the size of the // input. If that isn't small enough, the resulting pieces will be further // legalized. @@ -6080,6 +6128,358 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode, + unsigned PartIdx, + unsigned NumParts, + ArrayRef<Register> SrcParts, + const ShiftParams &Params, + LLT TargetTy, LLT ShiftAmtTy) { + auto WordShiftConst = getIConstantVRegVal(Params.WordShift, MRI); + auto BitShiftConst = getIConstantVRegVal(Params.BitShift, MRI); + assert(WordShiftConst && BitShiftConst && "Expected constants"); + + const unsigned ShiftWords = WordShiftConst->getZExtValue(); + const unsigned ShiftBits = BitShiftConst->getZExtValue(); + const bool NeedsInterWordShift = ShiftBits != 0; + + switch (Opcode) { + case TargetOpcode::G_SHL: { + // Data moves from lower indices to higher indices + // If this part would come from a source beyond our range, it's zero + if (PartIdx < ShiftWords) + return Params.Zero; + + unsigned SrcIdx = PartIdx - ShiftWords; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Combine shifted main part with carry from previous part + auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx], Params.BitShift); + if (SrcIdx > 0) { + auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx - 1], + Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Hi, Lo).getReg(0); + } + return Hi.getReg(0); + } + + case TargetOpcode::G_LSHR: { + unsigned SrcIdx = PartIdx + ShiftWords; + if (SrcIdx >= 
NumParts) + return Params.Zero; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Combine shifted main part with carry from next part + auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift); + if (SrcIdx + 1 < NumParts) { + auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx + 1], + Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0); + } + return Lo.getReg(0); + } + + case TargetOpcode::G_ASHR: { + // Like LSHR but preserves sign bit + unsigned SrcIdx = PartIdx + ShiftWords; + if (SrcIdx >= NumParts) + return Params.SignBit; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Only the original MSB part uses arithmetic shift to preserve sign. All + // other parts use logical shift since they're just moving data bits. + auto Lo = + (SrcIdx == NumParts - 1) + ? MIRBuilder.buildAShr(TargetTy, SrcParts[SrcIdx], Params.BitShift) + : MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift); + Register HiSrc = + (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit; + auto Hi = MIRBuilder.buildShl(TargetTy, HiSrc, Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0); + } + + default: + llvm_unreachable("not a shift"); + } +} + +Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode, + Register MainOperand, + Register ShiftAmt, + LLT TargetTy, + Register CarryOperand) { + // This helper generates a single output part for variable shifts by combining + // the main operand (shifted by BitShift) with carry bits from an adjacent + // part. + + // For G_ASHR, individual parts don't have their own sign bit, only the + // complete value does. So we use LSHR for the main operand shift in ASHR + // context. + unsigned MainOpcode = + (Opcode == TargetOpcode::G_ASHR) ? 
TargetOpcode::G_LSHR : Opcode; + + // Perform the primary shift on the main operand + Register MainShifted = + MIRBuilder.buildInstr(MainOpcode, {TargetTy}, {MainOperand, ShiftAmt}) + .getReg(0); + + // No carry operand available + if (!CarryOperand.isValid()) + return MainShifted; + + // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs, + // so carry bits aren't needed. + LLT ShiftAmtTy = MRI.getType(ShiftAmt); + auto ZeroConst = MIRBuilder.buildConstant(ShiftAmtTy, 0); + LLT BoolTy = LLT::scalar(1); + auto IsZeroBitShift = + MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, ShiftAmt, ZeroConst); + + // Extract bits from the adjacent part that will "carry over" into this part. + // The carry direction is opposite to the main shift direction, so we can + // align the two shifted values before combining them with OR. + + // Determine the carry shift opcode (opposite direction) + unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR + : TargetOpcode::G_SHL; + + // Calculate inverse shift amount: BitWidth - ShiftAmt + auto TargetBitsConst = + MIRBuilder.buildConstant(ShiftAmtTy, TargetTy.getScalarSizeInBits()); + auto InvShiftAmt = MIRBuilder.buildSub(ShiftAmtTy, TargetBitsConst, ShiftAmt); + + // Shift the carry operand + Register CarryBits = + MIRBuilder + .buildInstr(CarryOpcode, {TargetTy}, {CarryOperand, InvShiftAmt}) + .getReg(0); + + // If BitShift is 0, don't include carry bits (InvShiftAmt would equal + // TargetBits which would be poison for the individual carry shift operation). 
+ auto ZeroReg = MIRBuilder.buildConstant(TargetTy, 0); + Register SafeCarryBits = + MIRBuilder.buildSelect(TargetTy, IsZeroBitShift, ZeroReg, CarryBits) + .getReg(0); + + // Combine the main shifted part with the carry bits + return MIRBuilder.buildOr(TargetTy, MainShifted, SafeCarryBits).getReg(0); +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI, + const APInt &Amt, + LLT TargetTy, + LLT ShiftAmtTy) { + // Any wide shift can be decomposed into WordShift + BitShift components. + // When shift amount is known constant, directly compute the decomposition + // values and generate constant registers. + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + + const unsigned DstBits = DstTy.getScalarSizeInBits(); + const unsigned TargetBits = TargetTy.getScalarSizeInBits(); + const unsigned NumParts = DstBits / TargetBits; + + assert(DstBits % TargetBits == 0 && "Target type must evenly divide source"); + + // When the shift amount is known at compile time, we just calculate which + // source parts contribute to each output part. 
+ + SmallVector<Register, 8> SrcParts; + extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI); + + if (Amt.isZero()) { + // No shift needed, just copy + MIRBuilder.buildMergeLikeInstr(DstReg, SrcParts); + MI.eraseFromParent(); + return Legalized; + } + + ShiftParams Params; + const unsigned ShiftWords = Amt.getZExtValue() / TargetBits; + const unsigned ShiftBits = Amt.getZExtValue() % TargetBits; + + // Generate constants and values needed by all shift types + Params.WordShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftWords).getReg(0); + Params.BitShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftBits).getReg(0); + Params.InvBitShift = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - ShiftBits).getReg(0); + Params.Zero = MIRBuilder.buildConstant(TargetTy, 0).getReg(0); + + // For ASHR, we need the sign-extended value to fill shifted-out positions + if (MI.getOpcode() == TargetOpcode::G_ASHR) + Params.SignBit = + MIRBuilder + .buildAShr(TargetTy, SrcParts[SrcParts.size() - 1], + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1)) + .getReg(0); + + SmallVector<Register, 8> DstParts(NumParts); + for (unsigned I = 0; I < NumParts; ++I) + DstParts[I] = buildConstantShiftPart(MI.getOpcode(), I, NumParts, SrcParts, + Params, TargetTy, ShiftAmtTy); + + MIRBuilder.buildMergeLikeInstr(DstReg, DstParts); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register AmtReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT ShiftAmtTy = MRI.getType(AmtReg); + + const unsigned DstBits = DstTy.getScalarSizeInBits(); + const unsigned TargetBits = TargetTy.getScalarSizeInBits(); + const unsigned NumParts = DstBits / TargetBits; + + assert(DstBits % TargetBits == 0 && "Target type must evenly divide source"); + assert(isPowerOf2_32(TargetBits) 
&& "Target bit width must be power of 2"); + + // If the shift amount is known at compile time, we can use direct indexing + // instead of generating select chains in the general case. + if (auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI)) + return narrowScalarShiftByConstantMultiway(MI, VRegAndVal->Value, TargetTy, + ShiftAmtTy); + + // For runtime-variable shift amounts, we must generate a more complex + // sequence that handles all possible shift values using select chains. + + // Split the input into target-sized pieces + SmallVector<Register, 8> SrcParts; + extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI); + + // Shifting by zero should be a no-op. + auto ZeroAmtConst = MIRBuilder.buildConstant(ShiftAmtTy, 0); + LLT BoolTy = LLT::scalar(1); + auto IsZeroShift = + MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, AmtReg, ZeroAmtConst); + + // Any wide shift can be decomposed into two components: + // 1. WordShift: number of complete target-sized words to shift + // 2. 
BitShift: number of bits to shift within each word + // + // Example: 128-bit >> 50 with 32-bit target: + // WordShift = 50 / 32 = 1 (shift right by 1 complete word) + // BitShift = 50 % 32 = 18 (shift each word right by 18 bits) + unsigned TargetBitsLog2 = Log2_32(TargetBits); + auto TargetBitsLog2Const = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBitsLog2); + auto TargetBitsMask = MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1); + + Register WordShift = + MIRBuilder.buildLShr(ShiftAmtTy, AmtReg, TargetBitsLog2Const).getReg(0); + Register BitShift = + MIRBuilder.buildAnd(ShiftAmtTy, AmtReg, TargetBitsMask).getReg(0); + + // Fill values: + // - SHL/LSHR: fill with zeros + // - ASHR: fill with sign-extended MSB + Register ZeroReg = MIRBuilder.buildConstant(TargetTy, 0).getReg(0); + + Register FillValue; + if (MI.getOpcode() == TargetOpcode::G_ASHR) { + auto TargetBitsMinusOneConst = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1); + FillValue = MIRBuilder + .buildAShr(TargetTy, SrcParts[NumParts - 1], + TargetBitsMinusOneConst) + .getReg(0); + } else { + FillValue = ZeroReg; + } + + SmallVector<Register, 8> DstParts(NumParts); + + // For each output part, generate a select chain that chooses the correct + // result based on the runtime WordShift value. This handles all possible + // word shift amounts by pre-calculating what each would produce. + for (unsigned I = 0; I < NumParts; ++I) { + // Initialize with appropriate default value for this shift type + Register InBoundsResult = FillValue; + + // clang-format off + // Build a branchless select chain by pre-computing results for all possible + // WordShift values (0 to NumParts-1). 
Each iteration nests a new select: + // + // K=0: select(WordShift==0, result0, FillValue) + // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue)) + // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...))) + // clang-format on + for (unsigned K = 0; K < NumParts; ++K) { + auto WordShiftKConst = MIRBuilder.buildConstant(ShiftAmtTy, K); + auto IsWordShiftK = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, + WordShift, WordShiftKConst); + + // Calculate source indices for this word shift + // + // For 4-part 128-bit value with K=1 word shift: + // SHL: [3][2][1][0] << K => [2][1][0][Z] + // -> (MainIdx = I-K, CarryIdx = I-K-1) + // LSHR: [3][2][1][0] >> K => [Z][3][2][1] + // -> (MainIdx = I+K, CarryIdx = I+K+1) + int MainSrcIdx; + int CarrySrcIdx; // Index for the word that provides the carried-in bits. + + switch (MI.getOpcode()) { + case TargetOpcode::G_SHL: + MainSrcIdx = (int)I - (int)K; + CarrySrcIdx = MainSrcIdx - 1; + break; + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: + MainSrcIdx = (int)I + (int)K; + CarrySrcIdx = MainSrcIdx + 1; + break; + default: + llvm_unreachable("Not a shift"); + } + + // Check bounds and build the result for this word shift + Register ResultForK; + if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) { + Register MainOp = SrcParts[MainSrcIdx]; + Register CarryOp; + + // Determine carry operand with bounds checking + if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts) + CarryOp = SrcParts[CarrySrcIdx]; + else if (MI.getOpcode() == TargetOpcode::G_ASHR && + CarrySrcIdx >= (int)NumParts) + CarryOp = FillValue; // Use sign extension + + ResultForK = buildVariableShiftPart(MI.getOpcode(), MainOp, BitShift, + TargetTy, CarryOp); + } else { + // Out of bounds - use fill value for this k + ResultForK = FillValue; + } + + // Select this result if WordShift equals k + InBoundsResult = + MIRBuilder + .buildSelect(TargetTy, IsWordShiftK, ResultForK, InBoundsResult) + .getReg(0); + } + 
+ // Handle zero-shift special case: if shift is 0, use original input + DstParts[I] = + MIRBuilder + .buildSelect(TargetTy, IsZeroShift, SrcParts[I], InBoundsResult) + .getReg(0); + } + + MIRBuilder.buildMergeLikeInstr(DstReg, DstParts); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, LLT MoreTy) { @@ -9537,6 +9937,54 @@ LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) { + assert((MI.getOpcode() == TargetOpcode::G_ABDS || + MI.getOpcode() == TargetOpcode::G_ABDU) && + "Expected G_ABDS or G_ABDU instruction"); + + auto [DstReg, LHS, RHS] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(LHS); + + // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + Register LHSSub = MIRBuilder.buildSub(Ty, LHS, RHS).getReg(0); + Register RHSSub = MIRBuilder.buildSub(Ty, RHS, LHS).getReg(0); + CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS) + ? 
CmpInst::ICMP_SGT + : CmpInst::ICMP_UGT; + auto ICmp = MIRBuilder.buildICmp(Pred, LLT::scalar(1), LHS, RHS); + MIRBuilder.buildSelect(DstReg, ICmp, LHSSub, RHSSub); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) { + assert((MI.getOpcode() == TargetOpcode::G_ABDS || + MI.getOpcode() == TargetOpcode::G_ABDU) && + "Expected G_ABDS or G_ABDU instruction"); + + auto [DstReg, LHS, RHS] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(LHS); + + // abds(lhs, rhs) -→ sub(smax(lhs, rhs), smin(lhs, rhs)) + // abdu(lhs, rhs) -→ sub(umax(lhs, rhs), umin(lhs, rhs)) + Register MaxReg, MinReg; + if (MI.getOpcode() == TargetOpcode::G_ABDS) { + MaxReg = MIRBuilder.buildSMax(Ty, LHS, RHS).getReg(0); + MinReg = MIRBuilder.buildSMin(Ty, LHS, RHS).getReg(0); + } else { + MaxReg = MIRBuilder.buildUMax(Ty, LHS, RHS).getReg(0); + MinReg = MIRBuilder.buildUMin(Ty, LHS, RHS).getReg(0); + } + MIRBuilder.buildSub(DstReg, MaxReg, MinReg); + + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) { Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e41fd81953f4..58d631e569b3 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -466,8 +466,14 @@ llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) { std::optional<DefinitionAndSourceRegister> llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) { Register DefSrcReg = Reg; - auto *DefMI = MRI.getVRegDef(Reg); - auto DstTy = MRI.getType(DefMI->getOperand(0).getReg()); + // This assumes that the code is in SSA form, so there should only be one + // definition. 
+ auto DefIt = MRI.def_begin(Reg); + if (DefIt == MRI.def_end()) + return {}; + MachineOperand &DefOpnd = *DefIt; + MachineInstr *DefMI = DefOpnd.getParent(); + auto DstTy = MRI.getType(DefOpnd.getReg()); if (!DstTy.isValid()) return std::nullopt; unsigned Opc = DefMI->getOpcode(); diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 93f6e39b56ab..e3ded12a1847 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -537,28 +537,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; + auto GapMask = APInt::getAllOnes(Factor); if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - APInt GapMask(Factor, 0); std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; - // We haven't supported gap mask for stores. Yet it is possible that we - // already changed the IR, hence returning true here. - if (GapMask.popcount() != Factor) - return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the store and // shuffle. - if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor)) + if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask)) return false; // Already have a new target specific interleaved store. Erase the old store. 
@@ -662,6 +660,10 @@ static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, } if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) { + Type *Op1Ty = SVI->getOperand(1)->getType(); + if (!isa<FixedVectorType>(Op1Ty)) + return {nullptr, GapMask}; + // Check that the shuffle mask is: a) an interleave, b) all of the same // set of the elements, and c) contained by the first source. (c) could // be relaxed if desired. diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index f12f437c493e..9d98e6c085fe 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -536,12 +536,6 @@ public: namespace llvm { -/// Implementation of the LiveDebugVariables pass. - -LiveDebugVariables::LiveDebugVariables() = default; -LiveDebugVariables::~LiveDebugVariables() = default; -LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default; - class LiveDebugVariables::LDVImpl { LocMap::Allocator allocator; MachineFunction *MF = nullptr; @@ -683,6 +677,12 @@ public: void print(raw_ostream&); }; +/// Implementation of the LiveDebugVariables pass. 
+ +LiveDebugVariables::LiveDebugVariables() = default; +LiveDebugVariables::~LiveDebugVariables() = default; +LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default; + } // namespace llvm static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS, diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp index 116a919585d7..17a7f48e3f2e 100644 --- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -21,6 +21,10 @@ using namespace llvm; AnalysisKey MachineFunctionAnalysis::Key; +llvm::MachineFunctionAnalysis::Result::Result( + std::unique_ptr<MachineFunction> MF) + : MF(std::move(MF)) {} + bool MachineFunctionAnalysis::Result::invalidate( Function &, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index d9e8484c08d7..da29ffc9d2fe 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -133,7 +133,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 32> LocalDefs; BitVector LocalDefsP(TRI->getNumRegUnits()); SmallSet<Register, 8> DeadDefSet; - SmallSet<Register, 16> KilledDefSet; SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; @@ -151,7 +150,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MO.setIsInternalRead(); if (MO.isKill()) { // Internal def is now killed. 
- KilledDefSet.insert(Reg); + DeadDefSet.insert(Reg); } } else { if (ExternUses.insert(Reg)) { @@ -171,21 +170,18 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, continue; if (LocalDefs.insert(Reg)) { - if (MO.isDead()) - DeadDefSet.insert(Reg); + if (!MO.isDead() && Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + LocalDefsP.set(Unit); + } } else { - // Re-defined inside the bundle, it's no longer killed. - KilledDefSet.erase(Reg); if (!MO.isDead()) { - // Previously defined but dead. + // Re-defined inside the bundle, it's no longer dead. DeadDefSet.erase(Reg); } } - - if (!MO.isDead() && Reg.isPhysical()) { - for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) - LocalDefsP.set(Unit); - } + if (MO.isDead()) + DeadDefSet.insert(Reg); } // Set FrameSetup/FrameDestroy for the bundle. If any of the instructions @@ -198,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, for (Register Reg : LocalDefs) { // If it's not live beyond end of the bundle, mark it dead. 
- bool isDead = DeadDefSet.contains(Reg) || KilledDefSet.contains(Reg); + bool isDead = DeadDefSet.contains(Reg); MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) | getImplRegState(true)); } diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index b0bce2c21a47..fdae3b470de0 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -59,8 +59,10 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -107,6 +109,16 @@ STATISTIC(StableHashAttempts, STATISTIC(StableHashDropped, "Count of unsuccessful hashing attempts for outlined functions"); STATISTIC(NumRemovedLOHs, "Total number of Linker Optimization Hints removed"); +STATISTIC(NumPGOBlockedOutlined, + "Number of times outlining was blocked by PGO"); +STATISTIC(NumPGOAllowedCold, + "Number of times outlining was allowed from cold functions"); +STATISTIC(NumPGOConservativeBlockedOutlined, + "Number of times outlining was blocked conservatively when profile " + "counts were missing"); +STATISTIC(NumPGOOptimisticOutlined, + "Number of times outlining was allowed optimistically when profile " + "counts were missing"); // Set to true if the user wants the outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr @@ -438,11 +450,10 @@ struct MachineOutliner : public ModulePass { /// The current repeat number of machine outlining. unsigned OutlineRepeatedNum = 0; - /// Set to true if the outliner should run on all functions in the module - /// considered safe for outlining. 
- /// Set to true by default for compatibility with llc's -run-pass option. - /// Set when the pass is constructed in TargetPassConfig. - bool RunOnAllFunctions = true; + /// The mode for whether to run the outliner + /// Set to always-outline by default for compatibility with llc's -run-pass + /// option. + RunOutliner RunOutlinerMode = RunOutliner::AlwaysOutline; /// This is a compact representation of hash sequences of outlined functions. /// It is used when OutlinerMode = CGDataMode::Write. @@ -468,6 +479,11 @@ struct MachineOutliner : public ModulePass { AU.addRequired<TargetPassConfig>(); AU.addPreserved<MachineModuleInfoWrapperPass>(); AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>(); + if (RunOutlinerMode == RunOutliner::OptimisticPGO || + RunOutlinerMode == RunOutliner::ConservativePGO) { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + } AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } @@ -578,9 +594,9 @@ struct MachineOutliner : public ModulePass { char MachineOutliner::ID = 0; namespace llvm { -ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) { +ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) { MachineOutliner *OL = new MachineOutliner(); - OL->RunOnAllFunctions = RunOnAllFunctions; + OL->RunOutlinerMode = RunOutlinerMode; return OL; } @@ -1017,9 +1033,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction( /* Outlined code is optimized code by definition. */ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized); - // Don't add any new variables to the subprogram. - DB.finalizeSubprogram(OutlinedSP); - // Attach subprogram to the function. F->setSubprogram(OutlinedSP); // We're done with the DIBuilder. 
@@ -1201,10 +1214,49 @@ bool MachineOutliner::outline( return OutlinedSomething; } +static bool allowPGOOutlining(RunOutliner RunOutlinerMode, + const ProfileSummaryInfo *PSI, + const BlockFrequencyInfo *BFI, + MachineBasicBlock &MBB) { + if (RunOutlinerMode != RunOutliner::OptimisticPGO && + RunOutlinerMode != RunOutliner::ConservativePGO) + return true; + auto *MF = MBB.getParent(); + if (MF->getFunction().hasFnAttribute(Attribute::Cold)) { + ++NumPGOAllowedCold; + return true; + } + + auto *BB = MBB.getBasicBlock(); + if (BB && PSI && BFI) + if (auto Count = BFI->getBlockProfileCount(BB)) + return *Count <= PSI->getOrCompColdCountThreshold(); + + if (RunOutlinerMode == RunOutliner::OptimisticPGO) { + auto *TII = MF->getSubtarget().getInstrInfo(); + if (TII->shouldOutlineFromFunctionByDefault(*MF)) { + // Profile data is unavailable, but we optimistically allow outlining + ++NumPGOOptimisticOutlined; + return true; + } + return false; + } + assert(RunOutlinerMode == RunOutliner::ConservativePGO); + // Profile data is unavailable, so we conservatively block outlining + ++NumPGOConservativeBlockedOutlined; + return false; +} + void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { // Build instruction mappings for each function in the module. Start by // iterating over each Function in M. 
LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n"); + bool EnableProfileGuidedOutlining = + RunOutlinerMode == RunOutliner::OptimisticPGO || + RunOutlinerMode == RunOutliner::ConservativePGO; + ProfileSummaryInfo *PSI = nullptr; + if (EnableProfileGuidedOutlining) + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); for (Function &F : M) { LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n"); @@ -1225,7 +1277,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { } const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) { + BlockFrequencyInfo *BFI = nullptr; + if (EnableProfileGuidedOutlining && F.hasProfileData()) + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); + if (RunOutlinerMode == RunOutliner::TargetDefault && + !TII->shouldOutlineFromFunctionByDefault(*MF)) { LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from " "function by default\n"); continue; @@ -1265,6 +1321,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { continue; } + if (!allowPGOOutlining(RunOutlinerMode, PSI, BFI, MBB)) { + ++NumPGOBlockedOutlined; + continue; + } + // MBB is suitable for outlining. Map it to a list of unsigneds. Mapper.convertToUnsignedVec(MBB, *TII); } @@ -1437,10 +1498,22 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) { // the user how the outliner is running. 
LLVM_DEBUG({ dbgs() << "Machine Outliner: Running on "; - if (RunOnAllFunctions) + switch (RunOutlinerMode) { + case RunOutliner::AlwaysOutline: dbgs() << "all functions"; - else + break; + case RunOutliner::OptimisticPGO: + dbgs() << "optimistically cold functions"; + break; + case RunOutliner::ConservativePGO: + dbgs() << "conservatively cold functions"; + break; + case RunOutliner::TargetDefault: dbgs() << "target-default functions"; + break; + case RunOutliner::NeverOutline: + llvm_unreachable("should not outline"); + } dbgs() << "\n"; }); diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index b7135251781a..abb3f3e61200 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const { return hasSingleElement(use_nodbg_instructions(RegNo)); } +MachineOperand *MachineRegisterInfo::getOneNonDBGUse(Register RegNo) const { + auto RegNoDbgUses = use_nodbg_operands(RegNo); + return hasSingleElement(RegNoDbgUses) ? &*RegNoDbgUses.begin() : nullptr; +} + MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const { auto RegNoDbgUsers = use_nodbg_instructions(RegNo); return hasSingleElement(RegNoDbgUsers) ? 
&*RegNoDbgUsers.begin() : nullptr; diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 96c9cde622b4..f54e2f264556 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -507,83 +507,86 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { }); break; case Intrinsic::objc_autorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_autorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autorelease); break; case Intrinsic::objc_autoreleasePoolPop: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPop); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPop); break; case Intrinsic::objc_autoreleasePoolPush: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPush); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPush); break; case Intrinsic::objc_autoreleaseReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleaseReturnValue); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleaseReturnValue); break; case Intrinsic::objc_copyWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_copyWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_copyWeak); break; case Intrinsic::objc_destroyWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_destroyWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_destroyWeak); break; case Intrinsic::objc_initWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_initWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_initWeak); break; case Intrinsic::objc_loadWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_loadWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeak); break; case Intrinsic::objc_loadWeakRetained: - Changed |= lowerObjCCall(F, RTLIB::objc_loadWeakRetained); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeakRetained); break; case Intrinsic::objc_moveWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_moveWeak); + Changed |= lowerObjCCall(F, 
RTLIB::impl_objc_moveWeak); break; case Intrinsic::objc_release: - Changed |= lowerObjCCall(F, RTLIB::objc_release, true); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_release, true); break; case Intrinsic::objc_retain: - Changed |= lowerObjCCall(F, RTLIB::objc_retain, true); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain, true); break; case Intrinsic::objc_retainAutorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainAutorelease); break; case Intrinsic::objc_retainAutoreleaseReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleaseReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleaseReturnValue); break; case Intrinsic::objc_retainAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleasedReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleasedReturnValue); break; case Intrinsic::objc_claimAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_claimAutoreleasedReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_claimAutoreleasedReturnValue); break; case Intrinsic::objc_retainBlock: - Changed |= lowerObjCCall(F, RTLIB::objc_retainBlock); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainBlock); break; case Intrinsic::objc_storeStrong: - Changed |= lowerObjCCall(F, RTLIB::objc_storeStrong); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeStrong); break; case Intrinsic::objc_storeWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_storeWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeWeak); break; case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: Changed |= - lowerObjCCall(F, RTLIB::objc_unsafeClaimAutoreleasedReturnValue); + lowerObjCCall(F, RTLIB::impl_objc_unsafeClaimAutoreleasedReturnValue); break; case Intrinsic::objc_retainedObject: - Changed |= lowerObjCCall(F, RTLIB::objc_retainedObject); + Changed |= lowerObjCCall(F, 
RTLIB::impl_objc_retainedObject); break; case Intrinsic::objc_unretainedObject: - Changed |= lowerObjCCall(F, RTLIB::objc_unretainedObject); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedObject); break; case Intrinsic::objc_unretainedPointer: - Changed |= lowerObjCCall(F, RTLIB::objc_unretainedPointer); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedPointer); break; case Intrinsic::objc_retain_autorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_retain_autorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain_autorelease); break; case Intrinsic::objc_sync_enter: - Changed |= lowerObjCCall(F, RTLIB::objc_sync_enter); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_enter); break; case Intrinsic::objc_sync_exit: - Changed |= lowerObjCCall(F, RTLIB::objc_sync_exit); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_exit); break; case Intrinsic::exp: case Intrinsic::exp2: diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 415674231b5c..a589ef761dd7 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -275,7 +275,6 @@ void ReachingDefAnalysis::printAllReachingDefs(MachineFunction &MF) { bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) { MF = &mf; - TRI = MF->getSubtarget().getRegisterInfo(); const TargetSubtargetInfo &STI = MF->getSubtarget(); TRI = STI.getRegisterInfo(); TII = STI.getInstrInfo(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 27b5a0d37b67..d130efe96b56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4710,7 +4710,10 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) { if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); - return 
Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc); + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap()); + // TODO: Preserve setNoSignedWrap if LogBase2 isn't BitWidth - 1. + return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc, Flags); } } @@ -9998,13 +10001,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } } - // fold (not (neg x)) -> (add X, -1) - // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if - // Y is a constant or the subtract has a single use. - if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB && - isNullConstant(N0.getOperand(0))) { - return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), - DAG.getAllOnesConstant(DL, VT)); + // fold (not (sub Y, X)) -> (add X, ~Y) if Y is a constant + if (N0.getOpcode() == ISD::SUB && isAllOnesConstant(N1)) { + SDValue Y = N0.getOperand(0); + SDValue X = N0.getOperand(1); + + if (auto *YConst = dyn_cast<ConstantSDNode>(Y)) { + APInt NotYValue = ~YConst->getAPIntValue(); + SDValue NotY = DAG.getConstant(NotYValue, DL, VT); + return DAG.getNode(ISD::ADD, DL, VT, X, NotY, N->getFlags()); + } } // fold (not (add X, -1)) -> (neg X) @@ -11089,38 +11095,43 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } } - // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or - // (and (srl x, (sub c2, c1), MASK) - if (N0.getOpcode() == ISD::SHL && - (N0.getOperand(1) == N1 || N0->hasOneUse()) && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { - auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, - ConstantSDNode *RHS) { - const APInt &LHSC = LHS->getAPIntValue(); - const APInt &RHSC = RHS->getAPIntValue(); - return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && - LHSC.getZExtValue() <= RHSC.getZExtValue(); - }; - if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, 
N01, N1); - SDValue Mask = DAG.getAllOnesConstant(DL, VT); - Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); - Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); - return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); - } - if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); - SDValue Mask = DAG.getAllOnesConstant(DL, VT); - Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); - SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); - return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + if (N0.getOpcode() == ISD::SHL) { + // fold (srl (shl nuw x, c), c) -> x + if (N0.getOperand(1) == N1 && N0->getFlags().hasNoUnsignedWrap()) + return N0.getOperand(0); + + // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or + // (and (srl x, (sub c2, c1), MASK) + if ((N0.getOperand(1) == N1 || N0->hasOneUse()) && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + const APInt &LHSC = LHS->getAPIntValue(); + const APInt &RHSC = RHS->getAPIntValue(); + return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && + LHSC.getZExtValue() <= RHSC.getZExtValue(); + }; + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } + if 
(ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } } } @@ -15137,7 +15148,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return foldedExt; } else if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector<SDNode *, 4> SetCCs; if (!N0.hasOneUse()) @@ -16309,7 +16320,15 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) { SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1)); - return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR); + SDNodeFlags Flags; + // Propagate nuw for sub. + if (N0->getOpcode() == ISD::SUB && N0->getFlags().hasNoUnsignedWrap() && + DAG.MaskedValueIsZero( + N0->getOperand(0), + APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(), + VT.getScalarSizeInBits()))) + Flags.setNoUnsignedWrap(true); + return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR, Flags); } } break; @@ -16788,6 +16807,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // If we have frozen and unfrozen users of N0, update so everything uses N. if (!N0.isUndef() && !N0.hasOneUse()) { SDValue FrozenN0(N, 0); + // Unfreeze all uses of N to avoid double deleting N from the CSE map. 
+ DAG.ReplaceAllUsesOfValueWith(FrozenN0, N0); DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0); // ReplaceAllUsesOfValueWith will have also updated the use in N, thus // creating a cycle in a DAG. Let's undo that by mutating the freeze. @@ -19346,13 +19367,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { // MachineBasicBlock CFG, which is awkward. // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal - // on the target. + // on the target, also copy fast math flags. if (N1.getOpcode() == ISD::SETCC && TLI.isOperationLegalOrCustom(ISD::BR_CC, N1.getOperand(0).getValueType())) { - return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, - Chain, N1.getOperand(2), - N1.getOperand(0), N1.getOperand(1), N2); + return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain, + N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2, + N1->getFlags()); } if (N1.hasOneUse()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1a63518ab37a..861f76e93f2c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -238,7 +238,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // Create the result registers for this node and add the result regs to // the machine instruction. 
- if (VRBase == 0) { + if (!VRBase) { assert(RC && "Isn't a register operand!"); VRBase = MRI->createVirtualRegister(RC); MIB.addReg(VRBase, RegState::Define); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 90d62e6da8e9..9e85f08abb76 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -324,6 +324,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VP_REDUCE(N); break; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + Res = PromoteIntRes_LOOP_DEPENDENCE_MASK(N); + break; + case ISD::FREEZE: Res = PromoteIntRes_FREEZE(N); break; @@ -374,6 +379,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, return GetPromotedInteger(Op); } +SDValue DAGTypeLegalizer::PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. SDValue Op = SExtPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 65fd863e55ac..586c3411791f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -382,6 +382,7 @@ private: SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -436,6 +437,7 @@ private: SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -868,6 +870,7 @@ private: // Vector Result Scalarization: <1 x ty> -> ty. void ScalarizeVectorResult(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); + SDValue ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue ScalarizeVecRes_BinOp(SDNode *N); SDValue ScalarizeVecRes_CMP(SDNode *N); SDValue ScalarizeVecRes_TernaryOp(SDNode *N); @@ -964,6 +967,7 @@ private: void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1070,6 +1074,7 @@ private: SDValue WidenVecRes_ADDRSPACECAST(SDNode *N); SDValue WidenVecRes_AssertZext(SDNode* N); SDValue WidenVecRes_BITCAST(SDNode* N); + SDValue WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2ca98958fde0..8e423c4f83b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -138,6 +138,7 @@ class VectorLegalizer { SDValue ExpandVP_FNEG(SDNode 
*Node); SDValue ExpandVP_FABS(SDNode *Node); SDValue ExpandVP_FCOPYSIGN(SDNode *Node); + SDValue ExpandLOOP_DEPENDENCE_MASK(SDNode *N); SDValue ExpandSELECT(SDNode *Node); std::pair<SDValue, SDValue> ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -475,6 +476,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1291,6 +1294,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::UCMP: Results.push_back(TLI.expandCMP(Node, DAG)); return; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + Results.push_back(ExpandLOOP_DEPENDENCE_MASK(Node)); + return; case ISD::FADD: case ISD::FMUL: @@ -1796,6 +1803,50 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } +SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) { + SDLoc DL(N); + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + + bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK; + EVT VT = N->getValueType(0); + EVT PtrVT = SourceValue->getValueType(0); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + if (IsReadAfterWrite) + Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff); + + Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize); + + // If the difference is positive then some elements may alias + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero, + IsReadAfterWrite ? 
ISD::SETEQ : ISD::SETLE); + + // Create the lane mask + EVT SplatVT = VT.changeElementType(PtrVT); + SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff); + SDValue VectorStep = DAG.getStepVector(DL, SplatVT); + EVT MaskVT = VT.changeElementType(MVT::i1); + SDValue DiffMask = + DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT); + + EVT EltVT = VT.getVectorElementType(); + // Extend the diff setcc in case the intrinsic has been promoted to a vector + // type with elements larger than i1 + if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits()) + DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask); + + // Splat the compare result then OR it with the lane mask + if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits()) + Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp); + SDValue Splat = DAG.getSplat(VT, DL, Cmp); + return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 10e3a5149a5d..118fd8418f78 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -53,6 +53,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to scalarize the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + R = ScalarizeVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; @@ -396,6 +400,22 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, return GetScalarizedVector(Op); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + EVT PtrVT = SourceValue->getValueType(0); + SDLoc DL(N); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + return DAG.getNode(ISD::OR, DL, CmpVT, + DAG.getSetCC(DL, CmpVT, Diff, EltSize, ISD::SETGE), + DAG.getSetCC(DL, CmpVT, Diff, Zero, ISD::SETEQ)); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector) @@ -1159,6 +1179,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to split the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + 
SplitVecRes_LOOP_DEPENDENCE_MASK(N, Lo, Hi); + break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break; case ISD::VSELECT: @@ -1652,6 +1676,25 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); } +void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + SDValue PtrA = N->getOperand(0); + SDValue PtrB = N->getOperand(1); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); + + unsigned EltSize = N->getConstantOperandVal(2); + unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); + SDValue Addend = HiVT.isScalableVT() + ? DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) + : DAG.getConstant(Offset, DL, MVT::i64); + + PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); +} + void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; @@ -2517,10 +2560,10 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl); + MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(), - N->getRanges()); + N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(), + Alignment, N->getAAInfo(), N->getRanges()); if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) { SDValue PassThru = MGT->getPassThru(); @@ -4321,10 +4364,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) { std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL); SDValue Lo; 
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(), - N->getRanges()); + N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(), + Alignment, N->getAAInfo(), N->getRanges()); if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) { SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale}; @@ -4784,6 +4827,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { #endif report_fatal_error("Do not know how to widen the result of this operator!"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + Res = WidenVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break; case ISD::ADDRSPACECAST: Res = WidenVecRes_ADDRSPACECAST(N); @@ -5986,6 +6033,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { return CreateStackStoreLoad(InOp, WidenVT); } +SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + return DAG.getNode( + N->getOpcode(), SDLoc(N), + TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), + N->getOperand(0), N->getOperand(1), N->getOperand(2)); +} + SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); // Build a vector with undefined for the new nodes. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 0a449fd011e6..72ea0898f975 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -63,6 +63,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) HorizontalVerticalBalance = 0; } +ResourcePriorityQueue::~ResourcePriorityQueue() = default; + unsigned ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 3672a91e33a3..bcf25958d098 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3299,7 +3299,7 @@ SelectionDAG::getValidShiftAmountRange(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3312,7 +3312,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3321,7 +3321,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const { return getValidShiftAmount(V, DemandedElts, Depth); } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3333,7 +3333,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> 
+std::optional<unsigned> SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3342,7 +3342,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const { return getValidMinimumShiftAmount(V, DemandedElts, Depth); } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3354,7 +3354,7 @@ SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMaximumShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3828,7 +3828,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::shl(Known, Known2, NUW, NSW, ShAmtNonZero); // Minimum shift low bits are known zero. - if (std::optional<uint64_t> ShMinAmt = + if (std::optional<unsigned> ShMinAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) Known.Zero.setLowBits(*ShMinAmt); break; @@ -3840,7 +3840,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Op->getFlags().hasExact()); // Minimum shift high bits are known zero. 
- if (std::optional<uint64_t> ShMinAmt = + if (std::optional<unsigned> ShMinAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) Known.Zero.setHighBits(*ShMinAmt); break; @@ -3850,6 +3850,22 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false, Op->getFlags().hasExact()); break; + case ISD::ROTL: + case ISD::ROTR: + if (ConstantSDNode *C = + isConstOrConstSplat(Op.getOperand(1), DemandedElts)) { + unsigned Amt = C->getAPIntValue().urem(BitWidth); + + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // Canonicalize to ROTR. + if (Opcode == ISD::ROTL && Amt != 0) + Amt = BitWidth - Amt; + + Known.Zero = Known.Zero.rotr(Amt); + Known.One = Known.One.rotr(Amt); + } + break; case ISD::FSHL: case ISD::FSHR: if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) { @@ -3868,15 +3884,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); if (Opcode == ISD::FSHL) { - Known.One <<= Amt; - Known.Zero <<= Amt; - Known2.One.lshrInPlace(BitWidth - Amt); - Known2.Zero.lshrInPlace(BitWidth - Amt); + Known <<= Amt; + Known2 >>= BitWidth - Amt; } else { - Known.One <<= BitWidth - Amt; - Known.Zero <<= BitWidth - Amt; - Known2.One.lshrInPlace(Amt); - Known2.Zero.lshrInPlace(Amt); + Known <<= BitWidth - Amt; + Known2 >>= Amt; } Known = Known.unionWith(Known2); } @@ -4875,15 +4887,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::SRA: Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); // SRA X, C -> adds C sign bits. 
- if (std::optional<uint64_t> ShAmt = + if (std::optional<unsigned> ShAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) - Tmp = std::min<uint64_t>(Tmp + *ShAmt, VTBits); + Tmp = std::min(Tmp + *ShAmt, VTBits); return Tmp; case ISD::SHL: if (std::optional<ConstantRange> ShAmtRange = getValidShiftAmountRange(Op, DemandedElts, Depth + 1)) { - uint64_t MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue(); - uint64_t MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue(); + unsigned MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue(); + unsigned MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue(); // Try to look through ZERO/SIGN/ANY_EXTEND. If all extended bits are // shifted out, then we can compute the number of sign bits for the // operand being extended. A future improvement could be to pass along the @@ -4894,7 +4906,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, EVT ExtVT = Ext.getValueType(); SDValue Extendee = Ext.getOperand(0); EVT ExtendeeVT = Extendee.getValueType(); - uint64_t SizeDifference = + unsigned SizeDifference = ExtVT.getScalarSizeInBits() - ExtendeeVT.getScalarSizeInBits(); if (SizeDifference <= MinShAmt) { Tmp = SizeDifference + @@ -5127,7 +5139,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, // If the sign portion ends in our element the subtraction gives correct // result. 
Otherwise it gives either negative or > bitwidth result - return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth); + return std::clamp(KnownSign - rIndex * BitWidth, 1, BitWidth); } case ISD::INSERT_VECTOR_ELT: { if (VT.isScalableVector()) @@ -5660,6 +5672,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::USUBSAT: case ISD::MULHU: case ISD::MULHS: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: case ISD::ABDU: case ISD::ABDS: case ISD::SMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 62ba801f6992..430e47451fd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7974,12 +7974,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::amdgcn_call_whole_wave: { TargetLowering::ArgListTy Args; + bool isTailCall = I.isTailCall(); // The first argument is the callee. Skip it when assembling the call args. for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) { TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)), I.getArgOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); + + // If we have an explicit sret argument that is an Instruction, (i.e., it + // might point to function-local memory), we can't meaningfully tail-call. 
+ if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx))) + isTailCall = false; + Args.push_back(Arg); } @@ -7994,7 +8001,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, .setChain(getRoot()) .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(), getValue(I.getArgOperand(0)), std::move(Args)) - .setTailCall(false) + .setTailCall(isTailCall && canTailCall(I)) .setIsPreallocated( I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) .setConvergent(I.isConvergent()) @@ -8295,6 +8302,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorExtractLastActive(I, Intrinsic); return; } + case Intrinsic::loop_dependence_war_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; + case Intrinsic::loop_dependence_raw_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; } } @@ -8456,8 +8475,11 @@ void SelectionDAGBuilder::visitVPLoad( MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + MachinePointerInfo(PtrOperand), MMOFlags, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], MMO, false /*IsExpanding */); @@ -8508,9 +8530,11 @@ void SelectionDAGBuilder::visitVPGather( Alignment = DAG.getEVTAlign(VT.getScalarType()); unsigned AS = PtrOperand->getType()->getScalarType()->getPointerAddressSpace(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo, Ranges); SDValue Base, Index, Scale; bool UniformBase = getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), @@ -8546,8 +8570,11 @@ void SelectionDAGBuilder::visitVPStore( Alignment = DAG.getEVTAlign(VT); SDValue Ptr = OpValues[1]; SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + MachinePointerInfo(PtrOperand), MMOFlags, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset, OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED, @@ -8569,9 +8596,11 @@ void SelectionDAGBuilder::visitVPScatter( Alignment = 
DAG.getEVTAlign(VT.getScalarType()); unsigned AS = PtrOperand->getType()->getScalarType()->getPointerAddressSpace(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo); SDValue Base, Index, Scale; bool UniformBase = getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), @@ -8609,9 +8638,12 @@ void SelectionDAGBuilder::visitVPStridedLoad( bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo, Ranges); SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], OpValues[3], MMO, @@ -8632,9 +8664,12 @@ void SelectionDAGBuilder::visitVPStridedStore( Alignment = DAG.getEVTAlign(VT.getScalarType()); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); + 
MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo); SDValue ST = DAG.getStridedStoreVP( getMemoryRoot(), DL, OpValues[0], OpValues[1], @@ -8901,6 +8936,29 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, return Result; } +bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { + bool isMustTailCall = CB.isMustTailCall(); + + // Avoid emitting tail calls in functions with the disable-tail-calls + // attribute. + const Function *Caller = CB.getParent()->getParent(); + if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == + "true" && + !isMustTailCall) + return false; + + // We can't tail call inside a function with a swifterror argument. Lowering + // does not support this yet. It would have to move into the swifterror + // register before the call. + if (DAG.getTargetLoweringInfo().supportSwiftError() && + Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + + // Check if target-independent constraints permit a tail call here. + // Target-dependent constraints are checked within TLI->LowerCallTo. + return isInTailCallPosition(CB, DAG.getTarget()); +} + void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, bool isTailCall, bool isMustTailCall, const BasicBlock *EHPadBB, @@ -8915,21 +8973,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, const Value *SwiftErrorVal = nullptr; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (isTailCall) { - // Avoid emitting tail calls in functions with the disable-tail-calls - // attribute. - auto *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && !isMustTailCall) - isTailCall = false; - - // We can't tail call inside a function with a swifterror argument. Lowering - // does not support this yet. It would have to move into the swifterror - // register before the call. 
- if (TLI.supportSwiftError() && - Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) - isTailCall = false; - } + if (isTailCall) + isTailCall = canTailCall(CB); for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { const Value *V = *I; @@ -8969,11 +9014,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, Args.push_back(Entry); } - // Check if target-independent constraints permit a tail call here. - // Target-dependent constraints are checked within TLI->LowerCallTo. - if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget())) - isTailCall = false; - // Disable tail calls if there is an swifterror argument. Targets have not // been updated to support tail calls. if (TLI.supportSwiftError() && SwiftErrorVal) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index e0835e631035..c7577fa335fe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -408,6 +408,10 @@ public: bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr, const TargetLowering::PtrAuthInfo *PAI = nullptr); + // Check some of the target-independent constraints for tail calls. This does + // not iterate over the call arguments. + bool canTailCall(const CallBase &CB) const; + // Lower range metadata from 0 to N to assert zext to an integer of nearest // floor power of two. 
SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 900da7645504..4b2a00c2e2cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -587,6 +587,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return "loop_dep_war"; + case ISD::LOOP_DEPENDENCE_RAW_MASK: + return "loop_dep_raw"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ece50ed95fc4..e61558c59bf0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1729,10 +1729,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = Register(); FuncInfo->ExceptionSelectorVirtReg = Register(); - if (LLVMBB->isEHPad()) + if (LLVMBB->isEHPad()) { if (!PrepareEHLandingPad()) continue; + if (!FastIS) { + SDValue NewRoot = TLI->lowerEHPadEntry(CurDAG->getRoot(), + SDB->getCurSDLoc(), *CurDAG); + if (NewRoot && NewRoot != CurDAG->getRoot()) + CurDAG->setRoot(NewRoot); + } + } + // Before doing SelectionDAG ISel, see if FastISel has been requested. 
if (FastIS) { if (LLVMBB != &Fn.getEntryBlock()) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 402a012e8e55..fd6d20e146bb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -832,7 +832,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::SHL: { // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { SDValue Op0 = Op.getOperand(0); unsigned ShAmt = *MaxSA; @@ -847,7 +847,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::SRL: { // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { SDValue Op0 = Op.getOperand(0); unsigned ShAmt = *MaxSA; @@ -1780,7 +1780,7 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -1792,7 +1792,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. if (Op0.getOpcode() == ISD::SRL) { if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; unsigned Opc = ISD::SHL; @@ -1832,7 +1832,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. 
if (InnerOp.getOpcode() == ISD::SRL && Op0.hasOneUse() && InnerOp.hasOneUse()) { - if (std::optional<uint64_t> SA2 = TLO.DAG.getValidShiftAmount( + if (std::optional<unsigned> SA2 = TLO.DAG.getValidShiftAmount( InnerOp, DemandedElts, Depth + 2)) { unsigned InnerShAmt = *SA2; if (InnerShAmt < ShAmt && InnerShAmt < InnerBits && @@ -1858,8 +1858,7 @@ bool TargetLowering::SimplifyDemandedBits( Op->dropFlags(SDNodeFlags::NoWrap); return true; } - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // low bits known zero. Known.Zero.setLowBits(ShAmt); @@ -1950,7 +1949,7 @@ bool TargetLowering::SimplifyDemandedBits( // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *MaxSA; unsigned NumSignBits = @@ -1966,7 +1965,7 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -1978,7 +1977,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. if (Op0.getOpcode() == ISD::SHL) { if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; unsigned Opc = ISD::SRL; @@ -1998,7 +1997,7 @@ bool TargetLowering::SimplifyDemandedBits( // single sra. We can do this if the top bits are never demanded. 
if (Op0.getOpcode() == ISD::SRA && Op0.hasOneUse()) { if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; // Clamp the combined shift amount if it exceeds the bit width. @@ -2042,8 +2041,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits known zero. Known.Zero.setHighBits(ShAmt); @@ -2064,7 +2062,7 @@ bool TargetLowering::SimplifyDemandedBits( // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *MaxSA; // Must already be signbits in DemandedBits bounds, and can't demand any @@ -2103,7 +2101,7 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.isOne()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -2112,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedBits( // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target // supports sext_inreg. 
if (Op0.getOpcode() == ISD::SHL) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned LowBits = BitWidth - ShAmt; EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits); @@ -2153,8 +2151,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. @@ -2225,10 +2222,8 @@ bool TargetLowering::SimplifyDemandedBits( Depth + 1)) return true; - Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt)); - Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt)); - Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); - Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); + Known2 <<= (IsFSHL ? Amt : (BitWidth - Amt)); + Known >>= (IsFSHL ? (BitWidth - Amt) : Amt); Known = Known.unionWith(Known2); // Attempt to avoid multi-use ops if we don't need anything from them. 
@@ -2363,8 +2358,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO, Depth + 1)) return true; - Known.One = Known2.One.reverseBits(); - Known.Zero = Known2.Zero.reverseBits(); + Known = Known2.reverseBits(); break; } case ISD::BSWAP: { @@ -2397,8 +2391,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO, Depth + 1)) return true; - Known.One = Known2.One.byteSwap(); - Known.Zero = Known2.Zero.byteSwap(); + Known = Known2.byteSwap(); break; } case ISD::CTPOP: { @@ -2664,11 +2657,11 @@ bool TargetLowering::SimplifyDemandedBits( break; } - std::optional<uint64_t> ShAmtC = + std::optional<unsigned> ShAmtC = TLO.DAG.getValidShiftAmount(Src, DemandedElts, Depth + 2); if (!ShAmtC || *ShAmtC >= BitWidth) break; - uint64_t ShVal = *ShAmtC; + unsigned ShVal = *ShAmtC; APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth); @@ -3234,27 +3227,6 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef.setAllBits(); return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); } - SDValue ScalarSrc = Op.getOperand(0); - if (ScalarSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - SDValue Src = ScalarSrc.getOperand(0); - SDValue Idx = ScalarSrc.getOperand(1); - EVT SrcVT = Src.getValueType(); - - ElementCount SrcEltCnt = SrcVT.getVectorElementCount(); - - if (SrcEltCnt.isScalable()) - return false; - - unsigned NumSrcElts = SrcEltCnt.getFixedValue(); - if (isNullConstant(Idx)) { - APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0); - APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts); - APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts); - if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero, - TLO, Depth + 1)) - return true; - } - } KnownUndef.setHighBits(NumElts - 1); break; } @@ -9740,8 +9712,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, SDValue 
TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { SDLoc dl(N); EVT VT = N->getValueType(0); - SDValue LHS = DAG.getFreeze(N->getOperand(0)); - SDValue RHS = DAG.getFreeze(N->getOperand(1)); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); bool IsSigned = N->getOpcode() == ISD::ABDS; // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs)) @@ -9749,34 +9721,37 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX; unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN; if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) { + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS); SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS); return DAG.getNode(ISD::SUB, dl, VT, Max, Min); } // abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs)) - if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) + if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) { + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); return DAG.getNode(ISD::OR, dl, VT, DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS), DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS)); + } // If the subtract doesn't overflow then just use abs(sub()) - // NOTE: don't use frozen operands for value tracking. 
- bool IsNonNegative = DAG.SignBitIsZero(N->getOperand(1)) && - DAG.SignBitIsZero(N->getOperand(0)); + bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS); - if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(0), - N->getOperand(1))) + if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS)) return DAG.getNode(ISD::ABS, dl, VT, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS)); - if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(1), - N->getOperand(0))) + if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS)) return DAG.getNode(ISD::ABS, dl, VT, DAG.getNode(ISD::SUB, dl, VT, RHS, LHS)); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT; + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC); // Branchless expansion iff cmp result is allbits: diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 9e49dddd46ba..0d7b128fc736 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -996,7 +996,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, (MI->getOpcode() != CombineOpc && CombineOpc != 0)) return false; // Must only used by the user we combine with. 
- if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + if (!MRI.hasOneNonDBGUse(MO.getReg())) return false; return true; @@ -1456,11 +1456,13 @@ void TargetInstrInfo::reassociateOps( MIB1->clearFlag(MachineInstr::MIFlag::NoSWrap); MIB1->clearFlag(MachineInstr::MIFlag::NoUWrap); MIB1->clearFlag(MachineInstr::MIFlag::IsExact); + MIB1->clearFlag(MachineInstr::MIFlag::Disjoint); MIB2->setFlags(IntersectedFlags); MIB2->clearFlag(MachineInstr::MIFlag::NoSWrap); MIB2->clearFlag(MachineInstr::MIFlag::NoUWrap); MIB2->clearFlag(MachineInstr::MIFlag::IsExact); + MIB2->clearFlag(MachineInstr::MIFlag::Disjoint); setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9ffced80b07f..c23281a820b2 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -612,23 +612,23 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate( RTLIB::LibcallImpl Impl) const { switch (Impl) { - case RTLIB::__aeabi_dcmpeq__une: - case RTLIB::__aeabi_fcmpeq__une: + case RTLIB::impl___aeabi_dcmpeq__une: + case RTLIB::impl___aeabi_fcmpeq__une: // Usage in the eq case, so we have to invert the comparison. return ISD::SETEQ; - case RTLIB::__aeabi_dcmpeq__oeq: - case RTLIB::__aeabi_fcmpeq__oeq: + case RTLIB::impl___aeabi_dcmpeq__oeq: + case RTLIB::impl___aeabi_fcmpeq__oeq: // Normal comparison to boolean value. 
return ISD::SETNE; - case RTLIB::__aeabi_dcmplt: - case RTLIB::__aeabi_dcmple: - case RTLIB::__aeabi_dcmpge: - case RTLIB::__aeabi_dcmpgt: - case RTLIB::__aeabi_dcmpun: - case RTLIB::__aeabi_fcmplt: - case RTLIB::__aeabi_fcmple: - case RTLIB::__aeabi_fcmpge: - case RTLIB::__aeabi_fcmpgt: + case RTLIB::impl___aeabi_dcmplt: + case RTLIB::impl___aeabi_dcmple: + case RTLIB::impl___aeabi_dcmpge: + case RTLIB::impl___aeabi_dcmpgt: + case RTLIB::impl___aeabi_dcmpun: + case RTLIB::impl___aeabi_fcmplt: + case RTLIB::impl___aeabi_fcmple: + case RTLIB::impl___aeabi_fcmpge: + case RTLIB::impl___aeabi_fcmpgt: /// The AEABI versions return a typical boolean value, so we can compare /// against the integer result as simply != 0. return ISD::SETNE; @@ -900,6 +900,9 @@ void TargetLoweringBase::initActions() { // Masked vector extracts default to expand. setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Expand); + // FP environment operations default to expand. setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); @@ -2406,6 +2409,34 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, return Flags; } +MachineMemOperand::Flags TargetLoweringBase::getVPIntrinsicMemOperandFlags( + const VPIntrinsic &VPIntrin) const { + MachineMemOperand::Flags Flags = MachineMemOperand::MONone; + Intrinsic::ID IntrinID = VPIntrin.getIntrinsicID(); + + switch (IntrinID) { + default: + llvm_unreachable("unexpected intrinsic. 
Existing code may be appropriate " + "for it, but support must be explicitly enabled"); + case Intrinsic::vp_load: + case Intrinsic::vp_gather: + case Intrinsic::experimental_vp_strided_load: + Flags = MachineMemOperand::MOLoad; + break; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + Flags = MachineMemOperand::MOStore; + break; + } + + if (VPIntrin.hasMetadata(LLVMContext::MD_nontemporal)) + Flags |= MachineMemOperand::MONonTemporal; + + Flags |= getTargetMMOFlags(VPIntrin); + return Flags; +} + Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const { diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index d19ef923ef74..ae681b9aebdf 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -247,6 +247,8 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx, break; case Triple::riscv32: case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; @@ -1918,6 +1920,13 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer, } emitCGProfileMetadata(Streamer, M); + emitPseudoProbeDescMetadata(Streamer, M, [](MCStreamer &Streamer) { + if (MCSymbol *Sym = + static_cast<MCSectionCOFF *>(Streamer.getCurrentSectionOnly()) + ->getCOMDATSymbol()) + if (Sym->isUndefined()) + Streamer.emitLabel(Sym); + }); } void TargetLoweringObjectFileCOFF::emitLinkerDirectives( diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 7d7c6e743fa7..b6169e6c4dc3 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -134,12 +134,18 @@ static 
cl::opt<cl::boolOrDefault> DebugifyCheckAndStripAll( static cl::opt<RunOutliner> EnableMachineOutliner( "enable-machine-outliner", cl::desc("Enable the machine outliner"), cl::Hidden, cl::ValueOptional, cl::init(RunOutliner::TargetDefault), - cl::values(clEnumValN(RunOutliner::AlwaysOutline, "always", - "Run on all functions guaranteed to be beneficial"), - clEnumValN(RunOutliner::NeverOutline, "never", - "Disable all outlining"), - // Sentinel value for unspecified option. - clEnumValN(RunOutliner::AlwaysOutline, "", ""))); + cl::values( + clEnumValN(RunOutliner::AlwaysOutline, "always", + "Run on all functions guaranteed to be beneficial"), + clEnumValN(RunOutliner::OptimisticPGO, "optimistic-pgo", + "Outline cold code only. If a code block does not have " + "profile data, optimistically assume it is cold."), + clEnumValN(RunOutliner::ConservativePGO, "conservative-pgo", + "Outline cold code only. If a code block does not have " + "profile, data, conservatively assume it is hot."), + clEnumValN(RunOutliner::NeverOutline, "never", "Disable all outlining"), + // Sentinel value for unspecified option. 
+ clEnumValN(RunOutliner::AlwaysOutline, "", ""))); static cl::opt<bool> EnableGlobalMergeFunc( "enable-global-merge-func", cl::Hidden, cl::desc("Enable global merge functions that are based on hash function")); @@ -1074,7 +1080,7 @@ bool TargetPassConfig::addISelPasses() { PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); addPass(createPreISelIntrinsicLoweringPass()); addPass(createExpandLargeDivRemPass()); - addPass(createExpandFpPass()); + addPass(createExpandFpPass(getOptLevel())); addIRPasses(); addCodeGenPrepare(); addPassesToHandleExceptions(); @@ -1224,12 +1230,9 @@ void TargetPassConfig::addMachinePasses() { if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOptLevel::None && EnableMachineOutliner != RunOutliner::NeverOutline) { - bool RunOnAllFunctions = - (EnableMachineOutliner == RunOutliner::AlwaysOutline); - bool AddOutliner = - RunOnAllFunctions || TM->Options.SupportsDefaultOutlining; - if (AddOutliner) - addPass(createMachineOutlinerPass(RunOnAllFunctions)); + if (EnableMachineOutliner != RunOutliner::TargetDefault || + TM->Options.SupportsDefaultOutlining) + addPass(createMachineOutlinerPass(EnableMachineOutliner)); } if (GCEmptyBlocks) |
