summary refs log tree commit diff
path: root/llvm/lib/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/CodeGen')
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp 23
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp 6
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp 9
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h 3
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp 14
-rw-r--r-- llvm/lib/CodeGen/AtomicExpandPass.cpp 26
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp 19
-rw-r--r-- llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp 10
-rw-r--r-- llvm/lib/CodeGen/CommandFlags.cpp 9
-rw-r--r-- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp 54
-rw-r--r-- llvm/lib/CodeGen/ExpandFp.cpp 496
-rw-r--r-- llvm/lib/CodeGen/ExpandVectorPredication.cpp 122
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp 2
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 62
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp 506
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/Utils.cpp 10
-rw-r--r-- llvm/lib/CodeGen/InterleavedAccessPass.cpp 14
-rw-r--r-- llvm/lib/CodeGen/LiveDebugVariables.cpp 12
-rw-r--r-- llvm/lib/CodeGen/MachineFunctionAnalysis.cpp 4
-rw-r--r-- llvm/lib/CodeGen/MachineInstrBundle.cpp 22
-rw-r--r-- llvm/lib/CodeGen/MachineOutliner.cpp 99
-rw-r--r-- llvm/lib/CodeGen/MachineRegisterInfo.cpp 5
-rw-r--r-- llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp 55
-rw-r--r-- llvm/lib/CodeGen/ReachingDefAnalysis.cpp 1
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 113
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp 11
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 5
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp 51
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 66
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 60
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 102
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp 10
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp 93
-rw-r--r-- llvm/lib/CodeGen/TargetInstrInfo.cpp 4
-rw-r--r-- llvm/lib/CodeGen/TargetLoweringBase.cpp 57
-rw-r--r-- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp 9
-rw-r--r-- llvm/lib/CodeGen/TargetPassConfig.cpp 29
41 files changed, 1755 insertions, 450 deletions
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 23a3543e9ebe..cd14a4f57f76 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1432,7 +1432,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
MCSection *BBAddrMapSection =
getObjFileLowering().getBBAddrMapSection(*MF.getSection());
assert(BBAddrMapSection && ".llvm_bb_addr_map section is not initialized.");
- bool HasCalls = !CurrentFnCallsiteSymbols.empty();
+ bool HasCalls = !CurrentFnCallsiteEndSymbols.empty();
const MCSymbol *FunctionSymbol = getFunctionBegin();
@@ -1497,13 +1497,13 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol);
const MCSymbol *CurrentLabel = MBBSymbol;
if (HasCalls) {
- auto CallsiteSymbols = CurrentFnCallsiteSymbols.lookup(&MBB);
+ auto CallsiteEndSymbols = CurrentFnCallsiteEndSymbols.lookup(&MBB);
OutStreamer->AddComment("number of callsites");
- OutStreamer->emitULEB128IntValue(CallsiteSymbols.size());
- for (const MCSymbol *CallsiteSymbol : CallsiteSymbols) {
+ OutStreamer->emitULEB128IntValue(CallsiteEndSymbols.size());
+ for (const MCSymbol *CallsiteEndSymbol : CallsiteEndSymbols) {
// Emit the callsite offset.
- emitLabelDifferenceAsULEB128(CallsiteSymbol, CurrentLabel);
- CurrentLabel = CallsiteSymbol;
+ emitLabelDifferenceAsULEB128(CallsiteEndSymbol, CurrentLabel);
+ CurrentLabel = CallsiteEndSymbol;
}
}
// Emit the offset to the end of the block, which can be used to compute
@@ -1941,8 +1941,6 @@ void AsmPrinter::emitFunctionBody() {
!MI.isDebugInstr()) {
HasAnyRealCode = true;
}
- if (MI.isCall() && MF->getTarget().Options.BBAddrMap)
- OutStreamer->emitLabel(createCallsiteSymbol(MBB));
// If there is a pre-instruction symbol, emit a label for it here.
if (MCSymbol *S = MI.getPreInstrSymbol())
@@ -2064,6 +2062,9 @@ void AsmPrinter::emitFunctionBody() {
break;
}
+ if (MI.isCall() && MF->getTarget().Options.BBAddrMap)
+ OutStreamer->emitLabel(createCallsiteEndSymbol(MBB));
+
if (TM.Options.EmitCallGraphSection && MI.isCall())
emitIndirectCalleeLabels(FuncInfo, CallSitesInfoMap, MI);
@@ -2897,11 +2898,11 @@ MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) {
return Res.first->second;
}
-MCSymbol *AsmPrinter::createCallsiteSymbol(const MachineBasicBlock &MBB) {
+MCSymbol *AsmPrinter::createCallsiteEndSymbol(const MachineBasicBlock &MBB) {
MCContext &Ctx = MF->getContext();
MCSymbol *Sym = Ctx.createTempSymbol("BB" + Twine(MF->getFunctionNumber()) +
"_" + Twine(MBB.getNumber()) + "_CS");
- CurrentFnCallsiteSymbols[&MBB].push_back(Sym);
+ CurrentFnCallsiteEndSymbols[&MBB].push_back(Sym);
return Sym;
}
@@ -2939,7 +2940,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
CurrentFnBegin = nullptr;
CurrentFnBeginLocal = nullptr;
CurrentSectionBeginSym = nullptr;
- CurrentFnCallsiteSymbols.clear();
+ CurrentFnCallsiteEndSymbols.clear();
MBBSectionRanges.clear();
MBBSectionExceptionSyms.clear();
bool NeedsLocalForSize = MAI->needsLocalForSize();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index c27f10077562..2090157a1a91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3111,8 +3111,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
&AP](const DbgValueLocEntry &Entry,
DIExpressionCursor &Cursor) -> bool {
if (Entry.isInt()) {
- if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
- BT->getEncoding() == dwarf::DW_ATE_signed_char))
+ if (BT && (BT->getEncoding() == dwarf::DW_ATE_boolean))
+ DwarfExpr.addBooleanConstant(Entry.getInt());
+ else if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
+ BT->getEncoding() == dwarf::DW_ATE_signed_char))
DwarfExpr.addSignedConstant(Entry.getInt());
else
DwarfExpr.addUnsignedConstant(Entry.getInt());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index e684054ffa3e..8a30714db2fd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -194,6 +194,15 @@ void DwarfExpression::addStackValue() {
emitOp(dwarf::DW_OP_stack_value);
}
+void DwarfExpression::addBooleanConstant(int64_t Value) {
+ assert(isImplicitLocation() || isUnknownLocation());
+ LocationKind = Implicit; // The expression now describes an implicit value.
+ if (Value == 0)
+ emitOp(dwarf::DW_OP_lit0);
+ else // Every non-zero value is emitted as DW_OP_lit1.
+ emitOp(dwarf::DW_OP_lit1);
+}
+
void DwarfExpression::addSignedConstant(int64_t Value) {
assert(isImplicitLocation() || isUnknownLocation());
LocationKind = Implicit;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 06809ab26387..700e0ec5813e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -229,6 +229,9 @@ public:
/// This needs to be called last to commit any pending changes.
void finalize();
+ /// Emit a boolean constant.
+ void addBooleanConstant(int64_t Value);
+
/// Emit a signed constant.
void addSignedConstant(int64_t Value);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index b03fac2d22a5..d76fd0c01020 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1351,6 +1351,13 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
ContextDIE = &getUnitDie();
// Build the decl now to ensure it precedes the definition.
getOrCreateSubprogramDIE(SPDecl);
+ // Check whether the DIE for SP has already been created after the call
+ // above.
+ // FIXME: Should the creation of definition subprogram DIE during
+ // the creation of declaration subprogram DIE be allowed?
+ // See https://github.com/llvm/llvm-project/pull/154636.
+ if (DIE *SPDie = getDIE(SP))
+ return SPDie;
}
}
@@ -1403,11 +1410,8 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
// Add the linkage name if we have one and it isn't in the Decl.
StringRef LinkageName = SP->getLinkageName();
- assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
- LinkageName == DeclLinkageName) &&
- "decl has a linkage name and it is different");
- if (DeclLinkageName.empty() &&
- // Always emit it for abstract subprograms.
+ // Always emit linkage name for abstract subprograms.
+ if (DeclLinkageName != LinkageName &&
(DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP)))
addLinkageName(SPDie, LinkageName);
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 278dd6560e73..4931403ab83a 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -84,7 +84,7 @@ private:
bool expandAtomicLoadToCmpXchg(LoadInst *LI);
StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
bool tryExpandAtomicStore(StoreInst *SI);
- void expandAtomicStore(StoreInst *SI);
+ void expandAtomicStoreToXChg(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
Value *
@@ -537,6 +537,9 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
LI->setAtomic(AtomicOrdering::NotAtomic);
return true;
+ case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
+ TLI->emitExpandAtomicLoad(LI);
+ return true;
default:
llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}
@@ -546,8 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
+ case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
+ TLI->emitExpandAtomicStore(SI);
+ return true;
case TargetLoweringBase::AtomicExpansionKind::Expand:
- expandAtomicStore(SI);
+ expandAtomicStoreToXChg(SI);
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
SI->setAtomic(AtomicOrdering::NotAtomic);
@@ -620,7 +626,7 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {
return NewSI;
}
-void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
+void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) {
// This function is only called on atomic stores that are too large to be
// atomic if implemented as a native store. So we replace them by an
// atomic swap, that can be implemented for example as a ldrex/strex on ARM
@@ -741,7 +747,7 @@ bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
}
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicRMWInst(AI);
- case TargetLoweringBase::AtomicExpansionKind::Expand:
+ case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
TLI->emitExpandAtomicRMW(AI);
return true;
default:
@@ -1454,7 +1460,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
- Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
+ Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB,
+ MDBuilder(F->getContext()).createLikelyBranchWeights());
Builder.SetInsertPoint(ReleasingStoreBB);
if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
@@ -1473,7 +1480,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
Builder.CreateCondBr(StoreSuccess, SuccessBB,
- CI->isWeak() ? FailureBB : RetryBB);
+ CI->isWeak() ? FailureBB : RetryBB,
+ MDBuilder(F->getContext()).createLikelyBranchWeights());
Builder.SetInsertPoint(ReleasedLoadBB);
Value *SecondLoad;
@@ -1486,7 +1494,9 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
- Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ Builder.CreateCondBr(
+ ShouldStore, TryStoreBB, NoStoreBB,
+ MDBuilder(F->getContext()).createLikelyBranchWeights());
// Update PHI node in TryStoreBB.
LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
} else
@@ -1695,7 +1705,7 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicCmpXchgInst(CI);
- case TargetLoweringBase::AtomicExpansionKind::Expand: {
+ case TargetLoweringBase::AtomicExpansionKind::CustomExpand: {
TLI->emitExpandAtomicCmpXchg(CI);
return true;
}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0e40a92fd8d6..9db4c9e5e280 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2618,22 +2618,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI,
bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
BasicBlock *BB = CI->getParent();
- // Lower inline assembly if we can.
- // If we found an inline asm expession, and if the target knows how to
- // lower it to normal LLVM code, do so now.
- if (CI->isInlineAsm()) {
- if (TLI->ExpandInlineAsm(CI)) {
- // Avoid invalidating the iterator.
- CurInstIterator = BB->begin();
- // Avoid processing instructions out of order, which could cause
- // reuse before a value is defined.
- SunkAddrs.clear();
- return true;
- }
- // Sink address computing for memory operands into the block.
- if (optimizeInlineAsmInst(CI))
- return true;
- }
+ // Sink address computing for memory operands into the block.
+ if (CI->isInlineAsm() && optimizeInlineAsmInst(CI))
+ return true;
// Align the pointer arguments to this call if the target thinks it's a good
// idea
diff --git a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
index 442ec3840930..5d7e2b59c204 100644
--- a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
+++ b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
@@ -45,7 +45,7 @@ static cl::opt<bool> EnableNoTrapAfterNoreturn(
"after noreturn calls, even if --trap-unreachable is set."));
void CodeGenTargetMachineImpl::initAsmInfo() {
- MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str()));
+ MRI.reset(TheTarget.createMCRegInfo(getTargetTriple()));
assert(MRI && "Unable to create reg info");
MII.reset(TheTarget.createMCInstrInfo());
assert(MII && "Unable to create instruction info");
@@ -53,12 +53,12 @@ void CodeGenTargetMachineImpl::initAsmInfo() {
// to some backends having subtarget feature dependent module level
// code generation. This is similar to the hack in the AsmPrinter for
// module level assembly etc.
- STI.reset(TheTarget.createMCSubtargetInfo(
- getTargetTriple().str(), getTargetCPU(), getTargetFeatureString()));
+ STI.reset(TheTarget.createMCSubtargetInfo(getTargetTriple(), getTargetCPU(),
+ getTargetFeatureString()));
assert(STI && "Unable to create subtarget info");
- MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(
- *MRI, getTargetTriple().str(), Options.MCOptions);
+ MCAsmInfo *TmpAsmInfo =
+ TheTarget.createMCAsmInfo(*MRI, getTargetTriple(), Options.MCOptions);
// TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
// and if the old one gets included then MCAsmInfo will be NULL and
// we'll crash later.
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 810dc29d728d..0522698adf18 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -68,7 +68,6 @@ CGOPT(bool, EnableUnsafeFPMath)
CGOPT(bool, EnableNoInfsFPMath)
CGOPT(bool, EnableNoNaNsFPMath)
CGOPT(bool, EnableNoSignedZerosFPMath)
-CGOPT(bool, EnableApproxFuncFPMath)
CGOPT(bool, EnableNoTrappingFPMath)
CGOPT(bool, EnableAIXExtendedAltivecABI)
CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
@@ -245,12 +244,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableNoSignedZerosFPMath);
- static cl::opt<bool> EnableApproxFuncFPMath(
- "enable-approx-func-fp-math",
- cl::desc("Enable FP math optimizations that assume approx func"),
- cl::init(false));
- CGBINDOPT(EnableApproxFuncFPMath);
-
static cl::opt<bool> EnableNoTrappingFPMath(
"enable-no-trapping-fp-math",
cl::desc("Enable setting the FP exceptions build "
@@ -563,7 +556,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.NoInfsFPMath = getEnableNoInfsFPMath();
Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
- Options.ApproxFuncFPMath = getEnableApproxFuncFPMath();
Options.NoTrappingFPMath = getEnableNoTrappingFPMath();
DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
@@ -718,7 +710,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
- HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math");
if (DenormalFPMathView->getNumOccurrences() > 0 &&
!F.hasFnAttribute("denormal-fp-math")) {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index de95e0aaf2cb..7d355e6e365d 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -60,6 +60,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/ADT/AllocatorList.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -263,6 +264,7 @@ public:
};
using Addend = std::pair<Value *, bool>;
+ using AddendList = BumpPtrList<Addend>;
using CompositeNode = ComplexDeinterleavingCompositeNode::CompositeNode;
// Helper struct for holding info about potential partial multiplication
@@ -291,7 +293,7 @@ private:
SmallPtrSet<Instruction *, 16> FinalInstructions;
/// Root instructions are instructions from which complex computation starts
- std::map<Instruction *, CompositeNode *> RootToNode;
+ DenseMap<Instruction *, CompositeNode *> RootToNode;
/// Topologically sorted root instructions
SmallVector<Instruction *, 1> OrderedRoots;
@@ -339,7 +341,7 @@ private:
/// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then
/// used in the ComplexDeinterleavingOperation::ReductionOperation node
/// replacement process.
- std::map<PHINode *, PHINode *> OldToNewPHI;
+ DenseMap<PHINode *, PHINode *> OldToNewPHI;
CompositeNode *prepareCompositeNode(ComplexDeinterleavingOperation Operation,
Value *R, Value *I) {
@@ -417,28 +419,28 @@ private:
/// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
/// Return nullptr if it is not possible to construct a complex number.
/// \p Flags are needed to generate symmetric Add and Sub operations.
- CompositeNode *identifyAdditions(std::list<Addend> &RealAddends,
- std::list<Addend> &ImagAddends,
+ CompositeNode *identifyAdditions(AddendList &RealAddends,
+ AddendList &ImagAddends,
std::optional<FastMathFlags> Flags,
CompositeNode *Accumulator);
/// Extract one addend that have both real and imaginary parts positive.
- CompositeNode *extractPositiveAddend(std::list<Addend> &RealAddends,
- std::list<Addend> &ImagAddends);
+ CompositeNode *extractPositiveAddend(AddendList &RealAddends,
+ AddendList &ImagAddends);
/// Determine if sum of multiplications of complex numbers can be formed from
/// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result
/// to it. Return nullptr if it is not possible to construct a complex number.
- CompositeNode *identifyMultiplications(std::vector<Product> &RealMuls,
- std::vector<Product> &ImagMuls,
+ CompositeNode *identifyMultiplications(SmallVectorImpl<Product> &RealMuls,
+ SmallVectorImpl<Product> &ImagMuls,
CompositeNode *Accumulator);
/// Go through pairs of multiplication (one Real and one Imag) and find all
/// possible candidates for partial multiplication and put them into \p
/// Candidates. Returns true if all Product has pair with common operand
- bool collectPartialMuls(const std::vector<Product> &RealMuls,
- const std::vector<Product> &ImagMuls,
- std::vector<PartialMulCandidate> &Candidates);
+ bool collectPartialMuls(ArrayRef<Product> RealMuls,
+ ArrayRef<Product> ImagMuls,
+ SmallVectorImpl<PartialMulCandidate> &Candidates);
/// If the code is compiled with -Ofast or expressions have `reassoc` flag,
/// the order of complex computation operations may be significantly altered,
@@ -1255,8 +1257,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
// Collect multiplications and addend instructions from the given instruction
// while traversing it operands. Additionally, verify that all instructions
// have the same fast math flags.
- auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls,
- std::list<Addend> &Addends) -> bool {
+ auto Collect = [&Flags](Instruction *Insn, SmallVectorImpl<Product> &Muls,
+ AddendList &Addends) -> bool {
SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}};
SmallPtrSet<Value *, 8> Visited;
while (!Worklist.empty()) {
@@ -1336,8 +1338,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
return true;
};
- std::vector<Product> RealMuls, ImagMuls;
- std::list<Addend> RealAddends, ImagAddends;
+ SmallVector<Product> RealMuls, ImagMuls;
+ AddendList RealAddends, ImagAddends;
if (!Collect(Real, RealMuls, RealAddends) ||
!Collect(Imag, ImagMuls, ImagAddends))
return nullptr;
@@ -1371,8 +1373,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
}
bool ComplexDeinterleavingGraph::collectPartialMuls(
- const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls,
- std::vector<PartialMulCandidate> &PartialMulCandidates) {
+ ArrayRef<Product> RealMuls, ArrayRef<Product> ImagMuls,
+ SmallVectorImpl<PartialMulCandidate> &PartialMulCandidates) {
// Helper function to extract a common operand from two products
auto FindCommonInstruction = [](const Product &Real,
const Product &Imag) -> Value * {
@@ -1423,18 +1425,18 @@ bool ComplexDeinterleavingGraph::collectPartialMuls(
ComplexDeinterleavingGraph::CompositeNode *
ComplexDeinterleavingGraph::identifyMultiplications(
- std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls,
+ SmallVectorImpl<Product> &RealMuls, SmallVectorImpl<Product> &ImagMuls,
CompositeNode *Accumulator = nullptr) {
if (RealMuls.size() != ImagMuls.size())
return nullptr;
- std::vector<PartialMulCandidate> Info;
+ SmallVector<PartialMulCandidate> Info;
if (!collectPartialMuls(RealMuls, ImagMuls, Info))
return nullptr;
// Map to store common instruction to node pointers
- std::map<Value *, CompositeNode *> CommonToNode;
- std::vector<bool> Processed(Info.size(), false);
+ DenseMap<Value *, CompositeNode *> CommonToNode;
+ SmallVector<bool> Processed(Info.size(), false);
for (unsigned I = 0; I < Info.size(); ++I) {
if (Processed[I])
continue;
@@ -1463,8 +1465,8 @@ ComplexDeinterleavingGraph::identifyMultiplications(
}
}
- std::vector<bool> ProcessedReal(RealMuls.size(), false);
- std::vector<bool> ProcessedImag(ImagMuls.size(), false);
+ SmallVector<bool> ProcessedReal(RealMuls.size(), false);
+ SmallVector<bool> ProcessedImag(ImagMuls.size(), false);
CompositeNode *Result = Accumulator;
for (auto &PMI : Info) {
if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx])
@@ -1580,7 +1582,7 @@ ComplexDeinterleavingGraph::identifyMultiplications(
ComplexDeinterleavingGraph::CompositeNode *
ComplexDeinterleavingGraph::identifyAdditions(
- std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends,
+ AddendList &RealAddends, AddendList &ImagAddends,
std::optional<FastMathFlags> Flags, CompositeNode *Accumulator = nullptr) {
if (RealAddends.size() != ImagAddends.size())
return nullptr;
@@ -1671,8 +1673,8 @@ ComplexDeinterleavingGraph::identifyAdditions(
}
ComplexDeinterleavingGraph::CompositeNode *
-ComplexDeinterleavingGraph::extractPositiveAddend(
- std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) {
+ComplexDeinterleavingGraph::extractPositiveAddend(AddendList &RealAddends,
+ AddendList &ImagAddends) {
for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) {
for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
auto [R, IsPositiveR] = *ItR;
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 1c1047c1ce18..9cc6c6a706c5 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -16,18 +16,29 @@
#include "llvm/CodeGen/ExpandFp.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/SimplifyQuery.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <optional>
+
+#define DEBUG_TYPE "expand-fp"
using namespace llvm;
@@ -37,6 +48,359 @@ static cl::opt<unsigned>
cl::desc("fp convert instructions on integers with "
"more than <N> bits are expanded."));
+namespace {
+/// This class implements a precise expansion of the frem instruction.
+/// The generated code is based on the fmod implementation in the AMD device
+/// libs.
+class FRemExpander {
+ /// The IRBuilder to use for the expansion.
+ IRBuilder<> &B;
+
+ /// Floating point type of the return value and the arguments of the FRem
+ /// instructions that should be expanded.
+ Type *FremTy;
+
+ /// Floating point type to use for the computation. This may be
+ /// wider than the \p FremTy.
+ Type *ComputeFpTy;
+
+ /// Integer type used to hold the exponents returned by frexp.
+ Type *ExTy;
+
+ /// How many bits of the quotient to compute per iteration of the
+ /// algorithm, stored as a value of type \p ExTy.
+ Value *Bits;
+
+ /// Constant 1 of type \p ExTy.
+ Value *One;
+
+public:
+ static bool canExpandType(Type *Ty) {
+ // TODO The expansion should work for other floating point types
+ // as well, but this would require additional testing.
+ return Ty->isIEEELikeFPTy() && !Ty->isBFloatTy() && !Ty->isFP128Ty();
+ }
+
+ static FRemExpander create(IRBuilder<> &B, Type *Ty) {
+ assert(canExpandType(Ty));
+
+ // The type to use for the computation of the remainder. This may be
+ // wider than the input/result type which affects the ...
+ Type *ComputeTy = Ty;
+ // ... maximum number of iterations of the remainder computation loop
+ // to use. This value is for the case in which the computation
+ // uses the same input/result type.
+ unsigned MaxIter = 2;
+
+ if (Ty->isHalfTy()) {
+ // Use the wider type and fewer iterations.
+ ComputeTy = B.getFloatTy();
+ MaxIter = 1;
+ }
+
+ unsigned Precision =
+ llvm::APFloat::semanticsPrecision(Ty->getFltSemantics());
+ return FRemExpander{B, Ty, Precision / MaxIter, ComputeTy};
+ }
+
+ /// Build the FRem expansion for the numerator \p X and the
+ /// denominator \p Y. The type of X and Y must match \p FremTy. The
+ /// code will be generated at the insertion point of \p B and the
+ /// insertion point will be reset at exit.
+ Value *buildFRem(Value *X, Value *Y, std::optional<SimplifyQuery> &SQ) const;
+
+ /// Build an approximate FRem expansion for the numerator \p X and
+ /// the denominator \p Y at the insertion point of builder \p B.
+ /// The type of X and Y must match \p FremTy.
+ Value *buildApproxFRem(Value *X, Value *Y) const;
+
+private:
+ FRemExpander(IRBuilder<> &B, Type *FremTy, unsigned Bits, Type *ComputeFpTy)
+ : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), ExTy(B.getInt32Ty()),
+ Bits(ConstantInt::get(ExTy, Bits)), One(ConstantInt::get(ExTy, 1)) {};
+
+ Value *createRcp(Value *V, const Twine &Name) const {
+ // Leave it to later optimizations to turn this into an rcp
+ // instruction if available.
+ return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
+ }
+
+ // Helper function to build the UPDATE_AX code which is common to the
+ // loop body and the "final iteration".
+ Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const {
+ // Build:
+ // float q = rint(ax * ayinv);
+ // ax = fma(-q, ay, ax);
+ // int clt = ax < 0.0f;
+ // float axp = ax + ay;
+ // ax = clt ? axp : ax;
+ Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
+ {}, "q");
+ Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax");
+ Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
+ ConstantFP::getZero(ComputeFpTy), "clt");
+ Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
+ return B.CreateSelect(Clt, Axp, AxUpdate, "ax");
+ }
+
+ /// Build code to extract the exponent and mantissa of \p Src.
+ /// Return a pair of the mantissa scaled by 2^\p NewExp and the
+ /// exponent minus one (for use as a loop bound), in that order.
+ std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp,
+ const Twine &ExName,
+ const Twine &PowName) const {
+ // Build:
+ // ExName = frexp_exp(Src) - 1;
+ // PowName = fldexp(frexp_mant(Src), NewExp);
+ Type *Ty = Src->getType();
+ Type *ExTy = B.getInt32Ty();
+ Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
+ Value *Mant = B.CreateExtractValue(Frexp, {0});
+ Value *Exp = B.CreateExtractValue(Frexp, {1});
+
+ Exp = B.CreateSub(Exp, One, ExName);
+ Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName);
+
+ return {Pow, Exp};
+ }
+
+ /// Build the main computation of the remainder for the case in which
+ /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the
+ /// denominator. Add the incoming edge from the computation result
+ /// to \p RetPhi.
+ void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X,
+ PHINode *RetPhi, FastMathFlags FMF) const {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(FMF);
+
+ // Build:
+ // ex = frexp_exp(ax) - 1;
+ // ax = fldexp(frexp_mant(ax), bits);
+ // ey = frexp_exp(ay) - 1;
+ // ay = fldexp(frexp_mant(ay), 1);
+ auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
+ auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
+
+ // Build:
+ // int nb = ex - ey;
+ // float ayinv = 1.0/ay;
+ Value *Nb = B.CreateSub(Ex, Ey, "nb");
+ Value *Ayinv = createRcp(Ay, "ayinv");
+
+ // Build: while (nb > bits)
+ BasicBlock *PreheaderBB = B.GetInsertBlock();
+ Function *Fun = PreheaderBB->getParent();
+ auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun);
+ auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun);
+
+ B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB);
+
+ // Build loop body:
+ // UPDATE_AX
+ // ax = fldexp(ax, bits);
+ // nb -= bits;
+ // One iteration of the loop is factored out. The code shared by
+ // the loop and this "iteration" is denoted by UPDATE_AX.
+ B.SetInsertPoint(LoopBB);
+ PHINode *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv");
+ NbIv->addIncoming(Nb, PreheaderBB);
+
+ auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi");
+ AxPhi->addIncoming(Ax, PreheaderBB);
+
+ Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
+ AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update");
+ AxPhi->addIncoming(AxPhiUpdate, LoopBB);
+ NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
+
+ B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB);
+
+ // Build final iteration
+ // ax = fldexp(ax, nb - bits + 1);
+ // UPDATE_AX
+ B.SetInsertPoint(ExitBB);
+
+ auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi");
+ AxPhiExit->addIncoming(Ax, PreheaderBB);
+ AxPhiExit->addIncoming(AxPhi, LoopBB);
+ auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi");
+ NbExitPhi->addIncoming(NbIv, LoopBB);
+ NbExitPhi->addIncoming(Nb, PreheaderBB);
+
+ Value *AxFinal = B.CreateLdexp(
+ AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax");
+ AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
+
+ // Build:
+ // ax = fldexp(ax, ey);
+ // ret = copysign(ax,x);
+ AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax");
+ if (ComputeFpTy != FremTy)
+ AxFinal = B.CreateFPTrunc(AxFinal, FremTy);
+ Value *Ret = B.CreateCopySign(AxFinal, X);
+
+ RetPhi->addIncoming(Ret, ExitBB);
+ }
+
+ /// Build the else-branch of the conditional in the FRem
+ /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay
+ /// = |Y|, and X is the numerator and Y the denominator. Add the
+ /// incoming edge from the result to \p RetPhi.
+ void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const {
+ // Build:
+ // ret = ax == ay ? copysign(0.0f, x) : x;
+ Value *ZeroWithXSign = B.CreateCopySign(ConstantFP::getZero(FremTy), X);
+ Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X);
+
+ RetPhi->addIncoming(Ret, B.GetInsertBlock());
+ }
+
+ /// Return a value that is NaN if one of the corner cases concerning
+ /// the inputs \p X and \p Y is detected, and \p Ret otherwise.
+ Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y,
+ std::optional<SimplifyQuery> &SQ,
+ bool NoInfs) const {
+ // Build:
+ // ret = (y == 0.0f || isnan(y)) ? QNAN : ret;
+ // ret = isfinite(x) ? ret : QNAN;
+ Value *Nan = ConstantFP::getQNaN(FremTy);
+ Ret = B.CreateSelect(B.CreateFCmpUEQ(Y, ConstantFP::getZero(FremTy)), Nan,
+ Ret);
+ Value *XFinite =
+ NoInfs || (SQ && isKnownNeverInfinity(X, *SQ))
+ ? B.getTrue()
+ : B.CreateFCmpULT(B.CreateUnaryIntrinsic(Intrinsic::fabs, X),
+ ConstantFP::getInfinity(FremTy));
+ Ret = B.CreateSelect(XFinite, Ret, Nan);
+
+ return Ret;
+ }
+};
+
+Value *FRemExpander::buildApproxFRem(Value *X, Value *Y) const {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ // Propagating the approximate functions flag to the
+ // division leads to an unacceptable drop in precision
+ // on AMDGPU.
+ // TODO Find out if any flags might be worth propagating.
+ B.clearFastMathFlags();
+
+ Value *Quot = B.CreateFDiv(X, Y);
+ Value *Trunc = B.CreateUnaryIntrinsic(Intrinsic::trunc, Quot, {});
+ Value *Neg = B.CreateFNeg(Trunc);
+
+ return B.CreateFMA(Neg, Y, X);
+}
+
+Value *FRemExpander::buildFRem(Value *X, Value *Y,
+ std::optional<SimplifyQuery> &SQ) const {
+ assert(X->getType() == FremTy && Y->getType() == FremTy);
+
+ FastMathFlags FMF = B.getFastMathFlags();
+
+ // This function generates the following code structure:
+ // if (abs(x) > abs(y))
+ // { ret = compute remainder }
+ // else
+ // { ret = x or 0 with sign of x }
+ // Replace ret with NaN for corner-case inputs (y zero/NaN, x inf)
+ // return ret
+ Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax");
+ Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay");
+ if (ComputeFpTy != X->getType()) {
+ Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax");
+ Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay");
+ }
+ Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay);
+
+ PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
+ Value *Ret = RetPhi;
+
+ // We would return NaN in all corner cases handled here.
+ // Hence, if NaNs are excluded, keep the result as it is.
+ if (!FMF.noNaNs())
+ Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs());
+
+ Function *Fun = B.GetInsertBlock()->getParent();
+ auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
+ auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun);
+ SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB);
+
+ auto SavedInsertPt = B.GetInsertPoint();
+
+ // Build remainder computation for "then" branch
+ //
+ // The ordered comparison ensures that ax and ay are not NaNs
+ // in the then-branch. Furthermore, y cannot be an infinity and the
+ // check at the end of the function ensures that the result will not
+ // be used if x is an infinity.
+ FastMathFlags ComputeFMF = FMF;
+ ComputeFMF.setNoInfs();
+ ComputeFMF.setNoNaNs();
+
+ B.SetInsertPoint(ThenBB);
+ buildRemainderComputation(Ax, Ay, X, RetPhi, FMF);
+ B.CreateBr(RetPhi->getParent());
+
+ // Build "else"-branch
+ B.SetInsertPoint(ElseBB);
+ buildElseBranch(Ax, Ay, X, RetPhi);
+ B.CreateBr(RetPhi->getParent());
+
+ B.SetInsertPoint(SavedInsertPt);
+
+ return Ret;
+}
+} // namespace
+
+static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
+ LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
+
+ Type *ReturnTy = I.getType();
+ assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ // TODO Make use of those flags for optimization?
+ FMF.setAllowReciprocal(false);
+ FMF.setAllowContract(false);
+
+ IRBuilder<> B(&I);
+ B.setFastMathFlags(FMF);
+ B.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *ElemTy = ReturnTy->getScalarType();
+ const FRemExpander Expander = FRemExpander::create(B, ElemTy);
+
+ Value *Ret;
+ if (ReturnTy->isFloatingPointTy())
+ Ret = FMF.approxFunc()
+ ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+ : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
+ else {
+ auto *VecTy = cast<FixedVectorType>(ReturnTy);
+
+ // This could use SplitBlockAndInsertForEachLane but the interface
+ // is a bit awkward for a constant number of elements and it will
+ // boil down to the same code.
+ // TODO Expand the FRem instruction only once and reuse the code.
+ Value *Nums = I.getOperand(0);
+ Value *Denums = I.getOperand(1);
+ Ret = PoisonValue::get(I.getType());
+ for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+ Value *Num = B.CreateExtractElement(Nums, I);
+ Value *Denum = B.CreateExtractElement(Denums, I);
+ Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
+ : Expander.buildFRem(Num, Denum, SQ);
+ Ret = B.CreateInsertElement(Ret, Rem, I);
+ }
+ }
+
+ I.replaceAllUsesWith(Ret);
+ Ret->takeName(&I);
+ I.eraseFromParent();
+
+ return true;
+}
// clang-format off: preserve formatting of the following example
/// Generate code to convert a fp number to integer, replacing FPToS(U)I with
@@ -64,8 +428,8 @@ static cl::opt<unsigned>
/// br i1 %cmp6.not, label %if.end12, label %if.then8
///
/// if.then8: ; preds = %if.end
-/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808
-/// br label %cleanup
+/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808
+/// br label %cleanup
///
/// if.end12: ; preds = %if.end
/// %cmp13 = icmp ult i64 %shr, 150
@@ -83,9 +447,10 @@ static cl::opt<unsigned>
/// %mul19 = mul nsw i64 %shl, %conv
/// br label %cleanup
///
-/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8
-/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ]
-/// ret i64 %retval.0
+/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8
+/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ],
+/// [ %mul19, %if.else ], [ 0, %entry ]
+/// ret i64 %retval.0
/// }
///
/// Replace fp to integer with generated code.
@@ -272,13 +637,11 @@ static void expandFPToI(Instruction *FPToI) {
/// %or = or i64 %shr6, %conv11
/// br label %sw.epilog
///
-/// sw.epilog: ; preds = %sw.default, %if.then4, %sw.bb
-/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ]
-/// %1 = lshr i64 %a.addr.0, 2
-/// %2 = and i64 %1, 1
-/// %or16 = or i64 %2, %a.addr.0
-/// %inc = add nsw i64 %or16, 1
-/// %3 = and i64 %inc, 67108864
+/// sw.epilog: ; preds = %sw.default,
+/// %if.then4, %sw.bb
+/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl,
+/// %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2,
+/// %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864
/// %tobool.not = icmp eq i64 %3, 0
/// %spec.select.v = select i1 %tobool.not, i64 2, i64 3
/// %spec.select = ashr i64 %inc, %spec.select.v
@@ -291,7 +654,8 @@ static void expandFPToI(Instruction *FPToI) {
/// %shl25 = shl i64 %sub, %sh_prom24
/// br label %if.end26
///
-/// if.end26: ; preds = %sw.epilog, %if.else
+/// if.end26: ; preds = %sw.epilog,
+/// %if.else
/// %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ]
/// %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ]
/// %conv27 = trunc i64 %shr to i32
@@ -305,7 +669,8 @@ static void expandFPToI(Instruction *FPToI) {
/// %4 = bitcast i32 %or33 to float
/// br label %return
///
-/// return: ; preds = %entry, %if.end26
+/// return: ; preds = %entry,
+/// %if.end26
/// %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ]
/// ret float %retval.0
/// }
@@ -594,7 +959,38 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
I->eraseFromParent();
}
-static bool runImpl(Function &F, const TargetLowering &TLI) {
+// This covers all floating point types; more than we need here.
+// TODO Move somewhere else for general use?
+/// Return the Libcall for a frem instruction of
+/// type \p Ty.
+static RTLIB::Libcall fremToLibcall(Type *Ty) {
+ assert(Ty->isFloatingPointTy());
+ if (Ty->isFloatTy() || Ty->is16bitFPTy())
+ return RTLIB::REM_F32;
+ if (Ty->isDoubleTy())
+ return RTLIB::REM_F64;
+ if (Ty->isFP128Ty())
+ return RTLIB::REM_F128;
+ if (Ty->isX86_FP80Ty())
+ return RTLIB::REM_F80;
+ if (Ty->isPPC_FP128Ty())
+ return RTLIB::REM_PPCF128;
+
+ llvm_unreachable("Unknown floating point type");
+}
+
+/* Return true if, according to \p TLI, the target either directly
+ supports the frem instruction for the \p Ty, has a custom lowering,
+ or uses a libcall. */
+static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
+ if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty)))
+ return true;
+
+ return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
+}
+
+static bool runImpl(Function &F, const TargetLowering &TLI,
+ AssumptionCache *AC) {
SmallVector<Instruction *, 4> Replace;
SmallVector<Instruction *, 4> ReplaceVector;
bool Modified = false;
@@ -609,6 +1005,21 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
for (auto &I : instructions(F)) {
switch (I.getOpcode()) {
+ case Instruction::FRem: {
+ Type *Ty = I.getType();
+ // TODO: This pass doesn't handle scalable vectors.
+ if (Ty->isScalableTy())
+ continue;
+
+ if (targetSupportsFrem(TLI, Ty) ||
+ !FRemExpander::canExpandType(Ty->getScalarType()))
+ continue;
+
+ Replace.push_back(&I);
+ Modified = true;
+
+ break;
+ }
case Instruction::FPToUI:
case Instruction::FPToSI: {
// TODO: This pass doesn't handle scalable vectors.
@@ -659,8 +1070,20 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
while (!Replace.empty()) {
Instruction *I = Replace.pop_back_val();
- if (I->getOpcode() == Instruction::FPToUI ||
- I->getOpcode() == Instruction::FPToSI) {
+ if (I->getOpcode() == Instruction::FRem) {
+ auto SQ = [&]() -> std::optional<SimplifyQuery> {
+ if (AC) {
+ auto Res = std::make_optional<SimplifyQuery>(
+ I->getModule()->getDataLayout(), I);
+ Res->AC = AC;
+ return Res;
+ }
+ return {};
+ }();
+
+ expandFRem(cast<BinaryOperator>(*I), SQ);
+ } else if (I->getOpcode() == Instruction::FPToUI ||
+ I->getOpcode() == Instruction::FPToSI) {
expandFPToI(I);
} else {
expandIToFP(I);
@@ -672,31 +1095,58 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
namespace {
class ExpandFpLegacyPass : public FunctionPass {
+ CodeGenOptLevel OptLevel;
+
public:
static char ID;
- ExpandFpLegacyPass() : FunctionPass(ID) {
+ ExpandFpLegacyPass(CodeGenOptLevel OptLevel)
+ : FunctionPass(ID), OptLevel(OptLevel) {
initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry());
}
+ ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {};
+
bool runOnFunction(Function &F) override {
auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
- return runImpl(F, *TLI);
+ AssumptionCache *AC = nullptr;
+
+ if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone())
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return runImpl(F, *TLI, AC);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
+ if (OptLevel != CodeGenOptLevel::None)
+ AU.addRequired<AssumptionCacheTracker>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
} // namespace
+ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel)
+ : TM(TM), OptLevel(OptLevel) {}
+
+void ExpandFpPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<ExpandFpPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << '<';
+ OS << "O" << (int)OptLevel;
+ OS << '>';
+}
+
PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) {
const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F);
- return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
+ auto &TLI = *STI->getTargetLowering();
+ AssumptionCache *AC = nullptr;
+ if (OptLevel != CodeGenOptLevel::None)
+ AC = &FAM.getResult<AssumptionAnalysis>(F);
+ return runImpl(F, TLI, AC) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
}
char ExpandFpLegacyPass::ID = 0;
@@ -704,4 +1154,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
"Expand certain fp instructions", false, false)
INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
-FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); }
+FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) {
+ return new ExpandFpLegacyPass(OptLevel);
+}
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 753c65600770..03abc042e556 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -150,9 +150,8 @@ struct CachingVPExpander {
ElementCount ElemCount);
/// If needed, folds the EVL in the mask operand and discards the EVL
- /// parameter. Returns a pair of the value of the intrinsic after the change
- /// (if any) and whether the mask was actually folded.
- std::pair<Value *, bool> foldEVLIntoMask(VPIntrinsic &VPI);
+ /// parameter. Returns true if the mask was actually folded.
+ bool foldEVLIntoMask(VPIntrinsic &VPI);
/// "Remove" the %evl parameter of \p PI by setting it to the static vector
/// length of the operation. Returns true if the %evl (if any) was effectively
@@ -160,34 +159,31 @@ struct CachingVPExpander {
bool discardEVLParameter(VPIntrinsic &PI);
/// Lower this VP binary operator to a unpredicated binary operator.
- Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
- VPIntrinsic &PI);
+ bool expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI);
/// Lower this VP int call to a unpredicated int call.
- Value *expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI);
+ bool expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI);
/// Lower this VP fp call to a unpredicated fp call.
- Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
- unsigned UnpredicatedIntrinsicID);
+ bool expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
+ unsigned UnpredicatedIntrinsicID);
/// Lower this VP reduction to a call to an unpredicated reduction intrinsic.
- Value *expandPredicationInReduction(IRBuilder<> &Builder,
- VPReductionIntrinsic &PI);
+ bool expandPredicationInReduction(IRBuilder<> &Builder,
+ VPReductionIntrinsic &PI);
/// Lower this VP cast operation to a non-VP intrinsic.
- Value *expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
- VPIntrinsic &VPI);
+ bool expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPIntrinsic &VPI);
/// Lower this VP memory operation to a non-VP intrinsic.
- Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
- VPIntrinsic &VPI);
+ bool expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI);
/// Lower this VP comparison to a call to an unpredicated comparison.
- Value *expandPredicationInComparison(IRBuilder<> &Builder,
- VPCmpIntrinsic &PI);
+ bool expandPredicationInComparison(IRBuilder<> &Builder, VPCmpIntrinsic &PI);
/// Query TTI and expand the vector predication in \p P accordingly.
- Value *expandPredication(VPIntrinsic &PI);
+ bool expandPredication(VPIntrinsic &PI);
/// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This
/// overrides TTI with the cl::opts listed at the top of this file.
@@ -227,9 +223,8 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
}
-Value *
-CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
- VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
"Implicitly dropping %evl in non-speculatable operator!");
@@ -261,14 +256,14 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName());
replaceOperation(*NewBinOp, VPI);
- return NewBinOp;
+ return true;
}
-Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
- VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
std::optional<unsigned> FID = VPI.getFunctionalIntrinsicID();
if (!FID)
- return nullptr;
+ return false;
SmallVector<Value *, 2> Argument;
for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) {
Argument.push_back(VPI.getOperand(i));
@@ -276,10 +271,10 @@ Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
Value *NewOp = Builder.CreateIntrinsic(FID.value(), {VPI.getType()}, Argument,
/*FMFSource=*/nullptr, VPI.getName());
replaceOperation(*NewOp, VPI);
- return NewOp;
+ return true;
}
-Value *CachingVPExpander::expandPredicationToFPCall(
+bool CachingVPExpander::expandPredicationToFPCall(
IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) {
assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
"Implicitly dropping %evl in non-speculatable operator!");
@@ -297,7 +292,7 @@ Value *CachingVPExpander::expandPredicationToFPCall(
UnpredicatedIntrinsicID, {VPI.getType()}, Argument,
/*FMFSource=*/nullptr, VPI.getName());
replaceOperation(*NewOp, VPI);
- return NewOp;
+ return true;
}
case Intrinsic::fma:
case Intrinsic::fmuladd:
@@ -315,11 +310,11 @@ Value *CachingVPExpander::expandPredicationToFPCall(
else
NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName());
replaceOperation(*NewOp, VPI);
- return NewOp;
+ return true;
}
}
- return nullptr;
+ return false;
}
static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
@@ -331,9 +326,8 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
return getReductionIdentity(RdxID, EltTy, FMF);
}
-Value *
-CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
- VPReductionIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInReduction(
+ IRBuilder<> &Builder, VPReductionIntrinsic &VPI) {
assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
"Implicitly dropping %evl in non-speculatable operator!");
@@ -391,11 +385,11 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
}
replaceOperation(*Reduction, VPI);
- return Reduction;
+ return true;
}
-Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
- VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
Intrinsic::ID VPID = VPI.getIntrinsicID();
unsigned CastOpcode = VPIntrinsic::getFunctionalOpcodeForVP(VPID).value();
assert(Instruction::isCast(CastOpcode));
@@ -404,12 +398,11 @@ Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
VPI.getType(), VPI.getName());
replaceOperation(*CastOp, VPI);
- return CastOp;
+ return true;
}
-Value *
-CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
- VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
assert(VPI.canIgnoreVectorLengthParam());
const auto &DL = VPI.getDataLayout();
@@ -469,11 +462,11 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
assert(NewMemoryInst);
replaceOperation(*NewMemoryInst, VPI);
- return NewMemoryInst;
+ return true;
}
-Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
- VPCmpIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
+ VPCmpIntrinsic &VPI) {
assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
"Implicitly dropping %evl in non-speculatable operator!");
@@ -487,7 +480,7 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1);
replaceOperation(*NewCmp, VPI);
- return NewCmp;
+ return true;
}
bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
@@ -516,17 +509,24 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
return true;
}
-std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
+bool CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n');
IRBuilder<> Builder(&VPI);
// Ineffective %evl parameter and so nothing to do here.
if (VPI.canIgnoreVectorLengthParam())
- return {&VPI, false};
+ return false;
// Only VP intrinsics can have an %evl parameter.
Value *OldMaskParam = VPI.getMaskParam();
+ if (!OldMaskParam) {
+ assert((VPI.getIntrinsicID() == Intrinsic::vp_merge ||
+ VPI.getIntrinsicID() == Intrinsic::vp_select) &&
+ "Unexpected VP intrinsic without mask operand");
+ OldMaskParam = VPI.getArgOperand(0);
+ }
+
Value *OldEVLParam = VPI.getVectorLengthParam();
assert(OldMaskParam && "no mask param to fold the vl param into");
assert(OldEVLParam && "no EVL param to fold away");
@@ -538,7 +538,11 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
ElementCount ElemCount = VPI.getStaticVectorLength();
Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount);
Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam);
- VPI.setMaskParam(NewMaskParam);
+ if (VPI.getIntrinsicID() == Intrinsic::vp_merge ||
+ VPI.getIntrinsicID() == Intrinsic::vp_select)
+ VPI.setArgOperand(0, NewMaskParam);
+ else
+ VPI.setMaskParam(NewMaskParam);
// Drop the %evl parameter.
discardEVLParameter(VPI);
@@ -546,10 +550,10 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
"transformation did not render the evl param ineffective!");
// Reassess the modified instruction.
- return {&VPI, true};
+ return true;
}
-Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n');
IRBuilder<> Builder(&VPI);
@@ -566,9 +570,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI))
return expandPredicationInComparison(Builder, *VPCmp);
- if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) {
+ if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID()))
return expandPredicationToCastIntrinsic(Builder, VPI);
- }
switch (VPI.getIntrinsicID()) {
default:
@@ -578,6 +581,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
replaceOperation(*NewNegOp, VPI);
return NewNegOp;
}
+ case Intrinsic::vp_select:
+ case Intrinsic::vp_merge: {
+ assert(maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam());
+ Value *NewSelectOp = Builder.CreateSelect(
+ VPI.getOperand(0), VPI.getOperand(1), VPI.getOperand(2), VPI.getName());
+ replaceOperation(*NewSelectOp, VPI);
+ return NewSelectOp;
+ }
case Intrinsic::vp_abs:
case Intrinsic::vp_smax:
case Intrinsic::vp_smin:
@@ -613,10 +624,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
}
if (auto CID = VPI.getConstrainedIntrinsicID())
- if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID))
- return Call;
+ if (expandPredicationToFPCall(Builder, VPI, *CID))
+ return true;
- return &VPI;
+ return false;
}
//// } CachingVPExpander
@@ -673,8 +684,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) {
Changed = VPExpansionDetails::IntrinsicUpdated;
break;
case VPLegalization::Convert:
- if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) {
- (void)NewVPI;
+ if (foldEVLIntoMask(VPI)) {
Changed = VPExpansionDetails::IntrinsicUpdated;
++NumFoldedVL;
}
@@ -688,7 +698,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) {
case VPLegalization::Discard:
llvm_unreachable("Invalid strategy for operators.");
case VPLegalization::Convert:
- if (Value *V = expandPredication(VPI); V != &VPI) {
+ if (expandPredication(VPI)) {
++NumLoweredVPOps;
Changed = VPExpansionDetails::IntrinsicReplaced;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 90a18b86c1b1..b3c312569736 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1256,7 +1256,7 @@ LLT CallLowering::ValueHandler::getStackValueStoreType(
if (Flags.isPointer()) {
LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(),
ValTy.getScalarSizeInBits());
- if (ValVT.isVector())
+ if (ValVT.isVector() && ValVT.getVectorNumElements() != 1)
return LLT::vector(ValTy.getElementCount(), PtrTy);
return PtrTy;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0674f5fd1ae0..0ebee2cfd868 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2094,6 +2094,68 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI,
return true;
}
+bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI,
+ LshrOfTruncOfLshr &MatchInfo,
+ MachineInstr &ShiftMI) const {
+ assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR");
+ // Try to merge MI's constant shift with ShiftMI's constant shift; on success MatchInfo describes the replacement.
+ Register N0 = MI.getOperand(1).getReg();
+ Register N1 = MI.getOperand(2).getReg();
+ unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits();
+ // Both shift amounts must be constants or constant splats.
+ APInt N1C, N001C;
+ if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)))
+ return false;
+ auto N001 = ShiftMI.getOperand(2).getReg();
+ if (!mi_match(N001, MRI, m_ICstOrSplat(N001C)))
+ return false;
+ // Zero-extend the narrower constant so the two amounts can be added.
+ if (N001C.getBitWidth() > N1C.getBitWidth())
+ N1C = N1C.zext(N001C.getBitWidth());
+ else
+ N001C = N001C.zext(N1C.getBitWidth());
+ // Only match when the summed amount is below the scalar width of ShiftMI's result.
+ Register InnerShift = ShiftMI.getOperand(0).getReg();
+ LLT InnerShiftTy = MRI.getType(InnerShift);
+ uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits();
+ if ((N1C + N001C).ult(InnerShiftSize)) {
+ MatchInfo.Src = ShiftMI.getOperand(1).getReg();
+ MatchInfo.ShiftAmt = N1C + N001C;
+ MatchInfo.ShiftAmtTy = MRI.getType(N001);
+ MatchInfo.InnerShiftTy = InnerShiftTy;
+ // ShiftMI's amount plus MI's scalar width equals ShiftMI's width: succeed without requesting a mask.
+ if ((N001C + OpSizeInBits) == InnerShiftSize)
+ return true; // NOTE(review): Mask is not written on this path; presumably it defaults to false - confirm.
+ if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) {
+ MatchInfo.Mask = true;
+ MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C;
+ return true;
+ }
+ }
+ return false;
+}
+
+void CombinerHelper::applyLshrOfTruncOfLshr(
+ MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const {
+ assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR");
+ // Replace MI with trunc(lshr(Src, ShiftAmt)), AND-ing with a low-bit mask before the trunc when MatchInfo.Mask is set.
+ Register Dst = MI.getOperand(0).getReg();
+ auto ShiftAmt =
+ Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt);
+ auto Shift =
+ Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt);
+ if (MatchInfo.Mask == true) {
+ APInt MaskVal =
+ APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(),
+ MatchInfo.MaskVal.getZExtValue()); // MaskVal holds the number of low bits to keep.
+ auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal);
+ auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask);
+ Builder.buildTrunc(Dst, And);
+ } else
+ Builder.buildTrunc(Dst, Shift);
+ MI.eraseFromParent(); // The new instructions define Dst, so the original shift is removed.
+}
+
bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
unsigned &ShiftVal) const {
assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 008c18837a52..b02465d99a60 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2916,6 +2916,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_SREM:
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_ABDS:
Observer.changingInstr(MI);
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
@@ -2953,6 +2954,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
+ case TargetOpcode::G_ABDU:
Observer.changingInstr(MI);
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
@@ -4742,6 +4744,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
return lowerShlSat(MI);
case G_ABS:
return lowerAbsToAddXor(MI);
+ case G_ABDS:
+ case G_ABDU: {
+ bool IsSigned = MI.getOpcode() == G_ABDS;
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if ((IsSigned && LI.isLegal({G_SMIN, Ty}) && LI.isLegal({G_SMAX, Ty})) ||
+ (!IsSigned && LI.isLegal({G_UMIN, Ty}) && LI.isLegal({G_UMAX, Ty}))) {
+ return lowerAbsDiffToMinMax(MI);
+ }
+ return lowerAbsDiffToSelect(MI);
+ }
case G_FABS:
return lowerFAbs(MI);
case G_SELECT:
@@ -4773,6 +4785,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
return lowerVectorReduction(MI);
case G_VAARG:
return lowerVAArg(MI);
+ case G_ATOMICRMW_SUB: {
+ auto [Ret, Mem, Val] = MI.getFirst3Regs();
+ const LLT ValTy = MRI.getType(Val);
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ auto VNeg = MIRBuilder.buildNeg(ValTy, Val);
+ MIRBuilder.buildAtomicRMW(G_ATOMICRMW_ADD, Ret, Mem, VNeg, *MMO);
+ MI.eraseFromParent();
+ return Legalized;
+ }
}
}
@@ -5222,19 +5244,13 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
InsertVal = MI.getOperand(2).getReg();
Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
-
- // TODO: Handle total scalarization case.
- if (!NarrowVecTy.isVector())
- return UnableToLegalize;
-
LLT VecTy = MRI.getType(SrcVec);
// If the index is a constant, we can really break this down as you would
// expect, and index into the target size pieces.
- int64_t IdxVal;
auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
if (MaybeCst) {
- IdxVal = MaybeCst->Value.getSExtValue();
+ uint64_t IdxVal = MaybeCst->Value.getZExtValue();
// Avoid out of bounds indexing the pieces.
if (IdxVal >= VecTy.getNumElements()) {
MIRBuilder.buildUndef(DstReg);
@@ -5242,33 +5258,45 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
return Legalized;
}
- SmallVector<Register, 8> VecParts;
- LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
+ if (!NarrowVecTy.isVector()) {
+ SmallVector<Register, 8> SplitPieces;
+ extractParts(MI.getOperand(1).getReg(), NarrowVecTy,
+ VecTy.getNumElements(), SplitPieces, MIRBuilder, MRI);
+ if (IsInsert) {
+ SplitPieces[IdxVal] = InsertVal;
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), SplitPieces);
+ } else {
+ MIRBuilder.buildCopy(MI.getOperand(0).getReg(), SplitPieces[IdxVal]);
+ }
+ } else {
+ SmallVector<Register, 8> VecParts;
+ LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
- // Build a sequence of NarrowTy pieces in VecParts for this operand.
- LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
- TargetOpcode::G_ANYEXT);
+ // Build a sequence of NarrowTy pieces in VecParts for this operand.
+ LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
+ TargetOpcode::G_ANYEXT);
- unsigned NewNumElts = NarrowVecTy.getNumElements();
+ unsigned NewNumElts = NarrowVecTy.getNumElements();
- LLT IdxTy = MRI.getType(Idx);
- int64_t PartIdx = IdxVal / NewNumElts;
- auto NewIdx =
- MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
+ LLT IdxTy = MRI.getType(Idx);
+ int64_t PartIdx = IdxVal / NewNumElts;
+ auto NewIdx =
+ MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
- if (IsInsert) {
- LLT PartTy = MRI.getType(VecParts[PartIdx]);
+ if (IsInsert) {
+ LLT PartTy = MRI.getType(VecParts[PartIdx]);
- // Use the adjusted index to insert into one of the subvectors.
- auto InsertPart = MIRBuilder.buildInsertVectorElement(
- PartTy, VecParts[PartIdx], InsertVal, NewIdx);
- VecParts[PartIdx] = InsertPart.getReg(0);
+ // Use the adjusted index to insert into one of the subvectors.
+ auto InsertPart = MIRBuilder.buildInsertVectorElement(
+ PartTy, VecParts[PartIdx], InsertVal, NewIdx);
+ VecParts[PartIdx] = InsertPart.getReg(0);
- // Recombine the inserted subvector with the others to reform the result
- // vector.
- buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
- } else {
- MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+ // Recombine the inserted subvector with the others to reform the result
+ // vector.
+ buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
+ } else {
+ MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+ }
}
MI.eraseFromParent();
@@ -5970,7 +5998,6 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
return Legalized;
}
-// TODO: Optimize if constant shift amount.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
LLT RequestedTy) {
@@ -5992,6 +6019,27 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
if (DstEltSize % 2 != 0)
return UnableToLegalize;
+ // Check if we should use multi-way splitting instead of recursive binary
+ // splitting.
+ //
+ // Multi-way splitting directly decomposes wide shifts (e.g., 256-bit ->
+ // 8×32-bit) in a single legalization step, avoiding the recursive overhead
+ // and dependency chains created by the usual binary splitting approach
+ // (256->128->64->32).
+ //
+ // The >= 8 parts threshold ensures we only use this optimization when binary
+ // splitting would require multiple recursive passes, avoiding overhead for
+ // simple 2-way splits where binary approach is sufficient.
+ if (RequestedTy.isValid() && RequestedTy.isScalar() &&
+ DstEltSize % RequestedTy.getSizeInBits() == 0) {
+ const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits();
+ // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive
+ // steps).
+ if (NumParts >= 8)
+ return narrowScalarShiftMultiway(MI, RequestedTy);
+ }
+
+ // Fall back to binary splitting:
// Ignore the input type. We can only go to exactly half the size of the
// input. If that isn't small enough, the resulting pieces will be further
// legalized.
@@ -6080,6 +6128,358 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}
+Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode,
+ unsigned PartIdx,
+ unsigned NumParts,
+ ArrayRef<Register> SrcParts,
+ const ShiftParams &Params,
+ LLT TargetTy, LLT ShiftAmtTy) { // NOTE(review): ShiftAmtTy is not referenced in this body.
+ auto WordShiftConst = getIConstantVRegVal(Params.WordShift, MRI);
+ auto BitShiftConst = getIConstantVRegVal(Params.BitShift, MRI);
+ assert(WordShiftConst && BitShiftConst && "Expected constants");
+ // Produces output part PartIdx (of NumParts) for a shift whose constant amount is given as whole words (WordShift) plus leftover bits (BitShift).
+ const unsigned ShiftWords = WordShiftConst->getZExtValue();
+ const unsigned ShiftBits = BitShiftConst->getZExtValue();
+ const bool NeedsInterWordShift = ShiftBits != 0;
+ // Returns an existing register when no bit movement is needed; otherwise builds the shift/or instructions for this part.
+ switch (Opcode) {
+ case TargetOpcode::G_SHL: {
+ // Data moves from lower indices to higher indices
+ // If this part would come from a source beyond our range, it's zero
+ if (PartIdx < ShiftWords)
+ return Params.Zero;
+
+ unsigned SrcIdx = PartIdx - ShiftWords;
+ if (!NeedsInterWordShift)
+ return SrcParts[SrcIdx];
+
+ // Combine shifted main part with carry from previous part
+ auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+ if (SrcIdx > 0) {
+ auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx - 1],
+ Params.InvBitShift);
+ return MIRBuilder.buildOr(TargetTy, Hi, Lo).getReg(0);
+ }
+ return Hi.getReg(0);
+ }
+
+ case TargetOpcode::G_LSHR: {
+ unsigned SrcIdx = PartIdx + ShiftWords;
+ if (SrcIdx >= NumParts)
+ return Params.Zero;
+ if (!NeedsInterWordShift)
+ return SrcParts[SrcIdx];
+ // Mirror image of the G_SHL case: the source part sits ShiftWords above PartIdx.
+ // Combine shifted main part with carry from next part
+ auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+ if (SrcIdx + 1 < NumParts) {
+ auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx + 1],
+ Params.InvBitShift);
+ return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0);
+ }
+ return Lo.getReg(0);
+ }
+
+ case TargetOpcode::G_ASHR: {
+ // Like LSHR but preserves sign bit
+ unsigned SrcIdx = PartIdx + ShiftWords;
+ if (SrcIdx >= NumParts)
+ return Params.SignBit;
+ if (!NeedsInterWordShift)
+ return SrcParts[SrcIdx];
+
+ // Only the original MSB part uses arithmetic shift to preserve sign. All
+ // other parts use logical shift since they're just moving data bits.
+ auto Lo =
+ (SrcIdx == NumParts - 1)
+ ? MIRBuilder.buildAShr(TargetTy, SrcParts[SrcIdx], Params.BitShift)
+ : MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+ Register HiSrc =
+ (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit; // Past the top part, carry in Params.SignBit instead.
+ auto Hi = MIRBuilder.buildShl(TargetTy, HiSrc, Params.InvBitShift);
+ return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0);
+ }
+
+ default:
+ llvm_unreachable("not a shift");
+ }
+}
+
+Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode,
+ Register MainOperand,
+ Register ShiftAmt,
+ LLT TargetTy,
+ Register CarryOperand) {
+ // This helper generates a single output part for variable shifts by combining
+ // the main operand (shifted by BitShift) with carry bits from an adjacent
+ // part.
+ // CarryOperand may be an invalid Register; in that case only the shifted main operand is returned.
+ // For G_ASHR, individual parts don't have their own sign bit, only the
+ // complete value does. So we use LSHR for the main operand shift in ASHR
+ // context.
+ unsigned MainOpcode =
+ (Opcode == TargetOpcode::G_ASHR) ? TargetOpcode::G_LSHR : Opcode;
+
+ // Perform the primary shift on the main operand
+ Register MainShifted =
+ MIRBuilder.buildInstr(MainOpcode, {TargetTy}, {MainOperand, ShiftAmt})
+ .getReg(0);
+
+ // No carry operand available
+ if (!CarryOperand.isValid())
+ return MainShifted;
+
+ // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs,
+ // so carry bits aren't needed.
+ LLT ShiftAmtTy = MRI.getType(ShiftAmt);
+ auto ZeroConst = MIRBuilder.buildConstant(ShiftAmtTy, 0);
+ LLT BoolTy = LLT::scalar(1);
+ auto IsZeroBitShift =
+ MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, ShiftAmt, ZeroConst);
+
+ // Extract bits from the adjacent part that will "carry over" into this part.
+ // The carry direction is opposite to the main shift direction, so we can
+ // align the two shifted values before combining them with OR.
+
+ // Determine the carry shift opcode (opposite direction)
+ unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR
+ : TargetOpcode::G_SHL;
+
+ // Calculate inverse shift amount: BitWidth - ShiftAmt
+ auto TargetBitsConst =
+ MIRBuilder.buildConstant(ShiftAmtTy, TargetTy.getScalarSizeInBits());
+ auto InvShiftAmt = MIRBuilder.buildSub(ShiftAmtTy, TargetBitsConst, ShiftAmt);
+
+ // Shift the carry operand
+ Register CarryBits =
+ MIRBuilder
+ .buildInstr(CarryOpcode, {TargetTy}, {CarryOperand, InvShiftAmt})
+ .getReg(0);
+
+ // If BitShift is 0, don't include carry bits (InvShiftAmt would equal
+ // TargetBits which would be poison for the individual carry shift operation).
+ auto ZeroReg = MIRBuilder.buildConstant(TargetTy, 0);
+ Register SafeCarryBits =
+ MIRBuilder.buildSelect(TargetTy, IsZeroBitShift, ZeroReg, CarryBits)
+ .getReg(0);
+
+ // Combine the main shifted part with the carry bits
+ return MIRBuilder.buildOr(TargetTy, MainShifted, SafeCarryBits).getReg(0);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI,
+ const APInt &Amt,
+ LLT TargetTy,
+ LLT ShiftAmtTy) {
+ // Any wide shift can be decomposed into WordShift + BitShift components.
+ // When shift amount is known constant, directly compute the decomposition
+ // values and generate constant registers.
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ const unsigned DstBits = DstTy.getScalarSizeInBits();
+ const unsigned TargetBits = TargetTy.getScalarSizeInBits();
+ const unsigned NumParts = DstBits / TargetBits;
+ // NumParts comes from integer division; the assert below checks that it is exact.
+ assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
+
+ // When the shift amount is known at compile time, we just calculate which
+ // source parts contribute to each output part.
+
+ SmallVector<Register, 8> SrcParts;
+ extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI);
+
+ if (Amt.isZero()) {
+ // No shift needed, just copy
+ MIRBuilder.buildMergeLikeInstr(DstReg, SrcParts);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ ShiftParams Params;
+ const unsigned ShiftWords = Amt.getZExtValue() / TargetBits;
+ const unsigned ShiftBits = Amt.getZExtValue() % TargetBits;
+ // NOTE(review): Params.SignBit is only assigned for G_ASHR further down.
+ // Generate constants and values needed by all shift types
+ Params.WordShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftWords).getReg(0);
+ Params.BitShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftBits).getReg(0);
+ Params.InvBitShift =
+ MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - ShiftBits).getReg(0);
+ Params.Zero = MIRBuilder.buildConstant(TargetTy, 0).getReg(0);
+
+ // For ASHR, we need the sign-extended value to fill shifted-out positions
+ if (MI.getOpcode() == TargetOpcode::G_ASHR)
+ Params.SignBit =
+ MIRBuilder
+ .buildAShr(TargetTy, SrcParts[SrcParts.size() - 1],
+ MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1))
+ .getReg(0);
+ // Build each destination part independently, then merge them into DstReg.
+ SmallVector<Register, 8> DstParts(NumParts);
+ for (unsigned I = 0; I < NumParts; ++I)
+ DstParts[I] = buildConstantShiftPart(MI.getOpcode(), I, NumParts, SrcParts,
+ Params, TargetTy, ShiftAmtTy);
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstParts);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register AmtReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT ShiftAmtTy = MRI.getType(AmtReg);
+
+ const unsigned DstBits = DstTy.getScalarSizeInBits();
+ const unsigned TargetBits = TargetTy.getScalarSizeInBits();
+ const unsigned NumParts = DstBits / TargetBits;
+ // TargetTy is the narrow type the shift is split into; NumParts is how many such parts make up the destination.
+ assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
+ assert(isPowerOf2_32(TargetBits) && "Target bit width must be power of 2");
+ // The power-of-2 width lets WordShift/BitShift be derived below with a shift and a mask.
+ // If the shift amount is known at compile time, we can use direct indexing
+ // instead of generating select chains in the general case.
+ if (auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI))
+ return narrowScalarShiftByConstantMultiway(MI, VRegAndVal->Value, TargetTy,
+ ShiftAmtTy);
+
+ // For runtime-variable shift amounts, we must generate a more complex
+ // sequence that handles all possible shift values using select chains.
+
+ // Split the input into target-sized pieces
+ SmallVector<Register, 8> SrcParts;
+ extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI);
+
+ // Shifting by zero should be a no-op.
+ auto ZeroAmtConst = MIRBuilder.buildConstant(ShiftAmtTy, 0);
+ LLT BoolTy = LLT::scalar(1);
+ auto IsZeroShift =
+ MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, AmtReg, ZeroAmtConst);
+
+ // Any wide shift can be decomposed into two components:
+ // 1. WordShift: number of complete target-sized words to shift
+ // 2. BitShift: number of bits to shift within each word
+ //
+ // Example: 128-bit >> 50 with 32-bit target:
+ // WordShift = 50 / 32 = 1 (shift right by 1 complete word)
+ // BitShift = 50 % 32 = 18 (shift each word right by 18 bits)
+ unsigned TargetBitsLog2 = Log2_32(TargetBits);
+ auto TargetBitsLog2Const =
+ MIRBuilder.buildConstant(ShiftAmtTy, TargetBitsLog2);
+ auto TargetBitsMask = MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1);
+
+ Register WordShift =
+ MIRBuilder.buildLShr(ShiftAmtTy, AmtReg, TargetBitsLog2Const).getReg(0);
+ Register BitShift =
+ MIRBuilder.buildAnd(ShiftAmtTy, AmtReg, TargetBitsMask).getReg(0);
+
+ // Fill values:
+ // - SHL/LSHR: fill with zeros
+ // - ASHR: fill with sign-extended MSB
+ Register ZeroReg = MIRBuilder.buildConstant(TargetTy, 0).getReg(0);
+
+ Register FillValue;
+ if (MI.getOpcode() == TargetOpcode::G_ASHR) {
+ auto TargetBitsMinusOneConst =
+ MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1);
+ FillValue = MIRBuilder
+ .buildAShr(TargetTy, SrcParts[NumParts - 1],
+ TargetBitsMinusOneConst)
+ .getReg(0);
+ } else {
+ FillValue = ZeroReg;
+ }
+
+ SmallVector<Register, 8> DstParts(NumParts);
+
+ // For each output part, generate a select chain that chooses the correct
+ // result based on the runtime WordShift value. This handles all possible
+ // word shift amounts by pre-calculating what each would produce.
+ for (unsigned I = 0; I < NumParts; ++I) {
+ // Initialize with appropriate default value for this shift type
+ Register InBoundsResult = FillValue;
+
+ // clang-format off
+ // Build a branchless select chain by pre-computing results for all possible
+ // WordShift values (0 to NumParts-1). Each iteration nests a new select:
+ //
+ // K=0: select(WordShift==0, result0, FillValue)
+ // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue))
+ // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...)))
+ // clang-format on
+ for (unsigned K = 0; K < NumParts; ++K) {
+ auto WordShiftKConst = MIRBuilder.buildConstant(ShiftAmtTy, K);
+ auto IsWordShiftK = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy,
+ WordShift, WordShiftKConst);
+
+ // Calculate source indices for this word shift
+ //
+ // For 4-part 128-bit value with K=1 word shift:
+ // SHL: [3][2][1][0] << K => [2][1][0][Z]
+ // -> (MainIdx = I-K, CarryIdx = I-K-1)
+ // LSHR: [3][2][1][0] >> K => [Z][3][2][1]
+ // -> (MainIdx = I+K, CarryIdx = I+K+1)
+ int MainSrcIdx;
+ int CarrySrcIdx; // Index for the word that provides the carried-in bits.
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ MainSrcIdx = (int)I - (int)K;
+ CarrySrcIdx = MainSrcIdx - 1;
+ break;
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ MainSrcIdx = (int)I + (int)K;
+ CarrySrcIdx = MainSrcIdx + 1;
+ break;
+ default:
+ llvm_unreachable("Not a shift");
+ }
+
+ // Check bounds and build the result for this word shift
+ Register ResultForK;
+ if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) {
+ Register MainOp = SrcParts[MainSrcIdx];
+ Register CarryOp;
+
+ // Determine carry operand with bounds checking
+ if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts)
+ CarryOp = SrcParts[CarrySrcIdx];
+ else if (MI.getOpcode() == TargetOpcode::G_ASHR &&
+ CarrySrcIdx >= (int)NumParts)
+ CarryOp = FillValue; // Use sign extension
+ // Otherwise CarryOp stays default-constructed and the helper returns only the shifted main operand.
+ ResultForK = buildVariableShiftPart(MI.getOpcode(), MainOp, BitShift,
+ TargetTy, CarryOp);
+ } else {
+ // Out of bounds - use fill value for this k
+ ResultForK = FillValue;
+ }
+
+ // Select this result if WordShift equals k
+ InBoundsResult =
+ MIRBuilder
+ .buildSelect(TargetTy, IsWordShiftK, ResultForK, InBoundsResult)
+ .getReg(0);
+ }
+
+ // Handle zero-shift special case: if shift is 0, use original input
+ DstParts[I] =
+ MIRBuilder
+ .buildSelect(TargetTy, IsZeroShift, SrcParts[I], InBoundsResult)
+ .getReg(0);
+ }
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstParts);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
LLT MoreTy) {
@@ -9537,6 +9937,54 @@ LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) {
+ assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
+ MI.getOpcode() == TargetOpcode::G_ABDU) &&
+ "Expected G_ABDS or G_ABDU instruction");
+ // Lower an absolute-difference instruction to two subtractions, a compare and a select, then erase MI.
+ auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(LHS);
+ // The compare result is s1 for scalars and a same-length vector of s1 for vectors, so the select is applied per lane.
+ // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ Register LHSSub = MIRBuilder.buildSub(Ty, LHS, RHS).getReg(0);
+ Register RHSSub = MIRBuilder.buildSub(Ty, RHS, LHS).getReg(0);
+ CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS)
+ ? CmpInst::ICMP_SGT
+ : CmpInst::ICMP_UGT;
+ auto ICmp = MIRBuilder.buildICmp(Pred, Ty.changeElementSize(1), LHS, RHS);
+ MIRBuilder.buildSelect(DstReg, ICmp, LHSSub, RHSSub);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) {
+ assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
+ MI.getOpcode() == TargetOpcode::G_ABDU) &&
+ "Expected G_ABDS or G_ABDU instruction");
+ // Lower an absolute-difference instruction to max - min using the signed or unsigned min/max matching the opcode, then erase MI.
+ auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(LHS);
+
+ // abds(lhs, rhs) -> sub(smax(lhs, rhs), smin(lhs, rhs))
+ // abdu(lhs, rhs) -> sub(umax(lhs, rhs), umin(lhs, rhs))
+ Register MaxReg, MinReg;
+ if (MI.getOpcode() == TargetOpcode::G_ABDS) {
+ MaxReg = MIRBuilder.buildSMax(Ty, LHS, RHS).getReg(0);
+ MinReg = MIRBuilder.buildSMin(Ty, LHS, RHS).getReg(0);
+ } else {
+ MaxReg = MIRBuilder.buildUMax(Ty, LHS, RHS).getReg(0);
+ MinReg = MIRBuilder.buildUMin(Ty, LHS, RHS).getReg(0);
+ }
+ MIRBuilder.buildSub(DstReg, MaxReg, MinReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index e41fd81953f4..58d631e569b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -466,8 +466,14 @@ llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) {
std::optional<DefinitionAndSourceRegister>
llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) {
Register DefSrcReg = Reg;
- auto *DefMI = MRI.getVRegDef(Reg);
- auto DstTy = MRI.getType(DefMI->getOperand(0).getReg());
+ // This assumes that the code is in SSA form, so there should only be one
+ // definition.
+ auto DefIt = MRI.def_begin(Reg);
+ if (DefIt == MRI.def_end())
+ return {};
+ MachineOperand &DefOpnd = *DefIt;
+ MachineInstr *DefMI = DefOpnd.getParent();
+ auto DstTy = MRI.getType(DefOpnd.getReg());
if (!DstTy.isValid())
return std::nullopt;
unsigned Opc = DefMI->getOpcode();
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 93f6e39b56ab..e3ded12a1847 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -537,28 +537,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
"number of stored element should be a multiple of Factor");
Value *Mask = nullptr;
+ auto GapMask = APInt::getAllOnes(Factor);
if (SI) {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
- APInt GapMask(Factor, 0);
std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;
- // We haven't supported gap mask for stores. Yet it is possible that we
- // already changed the IR, hence returning true here.
- if (GapMask.popcount() != Factor)
- return true;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
<< *Store << "\n");
+ LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+ << " and actual factor " << GapMask.popcount() << "\n");
}
// Try to create target specific intrinsics to replace the store and
// shuffle.
- if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+ if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
return false;
// Already have a new target specific interleaved store. Erase the old store.
@@ -662,6 +660,10 @@ static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
}
if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
+ Type *Op1Ty = SVI->getOperand(1)->getType();
+ if (!isa<FixedVectorType>(Op1Ty))
+ return {nullptr, GapMask};
+
// Check that the shuffle mask is: a) an interleave, b) all of the same
// set of the elements, and c) contained by the first source. (c) could
// be relaxed if desired.
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index f12f437c493e..9d98e6c085fe 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -536,12 +536,6 @@ public:
namespace llvm {
-/// Implementation of the LiveDebugVariables pass.
-
-LiveDebugVariables::LiveDebugVariables() = default;
-LiveDebugVariables::~LiveDebugVariables() = default;
-LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default;
-
class LiveDebugVariables::LDVImpl {
LocMap::Allocator allocator;
MachineFunction *MF = nullptr;
@@ -683,6 +677,12 @@ public:
void print(raw_ostream&);
};
+/// Implementation of the LiveDebugVariables pass.
+
+LiveDebugVariables::LiveDebugVariables() = default;
+LiveDebugVariables::~LiveDebugVariables() = default;
+LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default;
+
} // namespace llvm
static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS,
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 116a919585d7..17a7f48e3f2e 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -21,6 +21,10 @@ using namespace llvm;
AnalysisKey MachineFunctionAnalysis::Key;
+llvm::MachineFunctionAnalysis::Result::Result(
+ std::unique_ptr<MachineFunction> MF)
+ : MF(std::move(MF)) {} // Moves the passed-in unique_ptr into the MF member.
+
bool MachineFunctionAnalysis::Result::invalidate(
Function &, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &) {
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index d9e8484c08d7..da29ffc9d2fe 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -133,7 +133,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSetVector<Register, 32> LocalDefs;
BitVector LocalDefsP(TRI->getNumRegUnits());
SmallSet<Register, 8> DeadDefSet;
- SmallSet<Register, 16> KilledDefSet;
SmallSetVector<Register, 8> ExternUses;
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
@@ -151,7 +150,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MO.setIsInternalRead();
if (MO.isKill()) {
// Internal def is now killed.
- KilledDefSet.insert(Reg);
+ DeadDefSet.insert(Reg);
}
} else {
if (ExternUses.insert(Reg)) {
@@ -171,21 +170,18 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
continue;
if (LocalDefs.insert(Reg)) {
- if (MO.isDead())
- DeadDefSet.insert(Reg);
+ if (!MO.isDead() && Reg.isPhysical()) {
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ LocalDefsP.set(Unit);
+ }
} else {
- // Re-defined inside the bundle, it's no longer killed.
- KilledDefSet.erase(Reg);
if (!MO.isDead()) {
- // Previously defined but dead.
+ // Re-defined inside the bundle, it's no longer dead.
DeadDefSet.erase(Reg);
}
}
-
- if (!MO.isDead() && Reg.isPhysical()) {
- for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
- LocalDefsP.set(Unit);
- }
+ if (MO.isDead())
+ DeadDefSet.insert(Reg);
}
// Set FrameSetup/FrameDestroy for the bundle. If any of the instructions
@@ -198,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
for (Register Reg : LocalDefs) {
// If it's not live beyond end of the bundle, mark it dead.
- bool isDead = DeadDefSet.contains(Reg) || KilledDefSet.contains(Reg);
+ bool isDead = DeadDefSet.contains(Reg);
MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |
getImplRegState(true));
}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index b0bce2c21a47..fdae3b470de0 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,8 +59,10 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CGData/CodeGenDataReader.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -107,6 +109,16 @@ STATISTIC(StableHashAttempts,
STATISTIC(StableHashDropped,
"Count of unsuccessful hashing attempts for outlined functions");
STATISTIC(NumRemovedLOHs, "Total number of Linker Optimization Hints removed");
+STATISTIC(NumPGOBlockedOutlined,
+ "Number of times outlining was blocked by PGO");
+STATISTIC(NumPGOAllowedCold,
+ "Number of times outlining was allowed from cold functions");
+STATISTIC(NumPGOConservativeBlockedOutlined,
+ "Number of times outlining was blocked conservatively when profile "
+ "counts were missing");
+STATISTIC(NumPGOOptimisticOutlined,
+ "Number of times outlining was allowed optimistically when profile "
+ "counts were missing");
// Set to true if the user wants the outliner to run on linkonceodr linkage
// functions. This is false by default because the linker can dedupe linkonceodr
@@ -438,11 +450,10 @@ struct MachineOutliner : public ModulePass {
/// The current repeat number of machine outlining.
unsigned OutlineRepeatedNum = 0;
- /// Set to true if the outliner should run on all functions in the module
- /// considered safe for outlining.
- /// Set to true by default for compatibility with llc's -run-pass option.
- /// Set when the pass is constructed in TargetPassConfig.
- bool RunOnAllFunctions = true;
+ /// The mode for whether to run the outliner
+ /// Set to always-outline by default for compatibility with llc's -run-pass
+ /// option.
+ RunOutliner RunOutlinerMode = RunOutliner::AlwaysOutline;
/// This is a compact representation of hash sequences of outlined functions.
/// It is used when OutlinerMode = CGDataMode::Write.
@@ -468,6 +479,11 @@ struct MachineOutliner : public ModulePass {
AU.addRequired<TargetPassConfig>();
AU.addPreserved<MachineModuleInfoWrapperPass>();
AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>();
+ if (RunOutlinerMode == RunOutliner::OptimisticPGO ||
+ RunOutlinerMode == RunOutliner::ConservativePGO) {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
AU.setPreservesAll();
ModulePass::getAnalysisUsage(AU);
}
@@ -578,9 +594,9 @@ struct MachineOutliner : public ModulePass {
char MachineOutliner::ID = 0;
namespace llvm {
-ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) {
+ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
MachineOutliner *OL = new MachineOutliner();
- OL->RunOnAllFunctions = RunOnAllFunctions;
+ OL->RunOutlinerMode = RunOutlinerMode;
return OL;
}
@@ -1017,9 +1033,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
/* Outlined code is optimized code by definition. */
DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
- // Don't add any new variables to the subprogram.
- DB.finalizeSubprogram(OutlinedSP);
-
// Attach subprogram to the function.
F->setSubprogram(OutlinedSP);
// We're done with the DIBuilder.
@@ -1201,10 +1214,49 @@ bool MachineOutliner::outline(
return OutlinedSomething;
}
+/// Decide whether \p MBB may be handed to the outliner under the given
+/// \p RunOutlinerMode.
+///
+/// Modes other than OptimisticPGO / ConservativePGO are never restricted here.
+/// In the two PGO modes a block is accepted when its function carries the
+/// `cold` attribute, or when a block profile count is available and does not
+/// exceed the PSI cold-count threshold. When no count can be obtained,
+/// OptimisticPGO defers to the target's shouldOutlineFromFunctionByDefault
+/// hook, while ConservativePGO rejects the block.
+///
+/// \param PSI profile summary; may be null.
+/// \param BFI block frequency info for the enclosing function; may be null.
+/// \returns true if the block may be mapped for outlining.
+static bool allowPGOOutlining(RunOutliner RunOutlinerMode,
+                              const ProfileSummaryInfo *PSI,
+                              const BlockFrequencyInfo *BFI,
+                              MachineBasicBlock &MBB) {
+  const bool IsOptimistic = RunOutlinerMode == RunOutliner::OptimisticPGO;
+  const bool IsConservative = RunOutlinerMode == RunOutliner::ConservativePGO;
+  if (!IsOptimistic && !IsConservative)
+    return true;
+
+  MachineFunction &ParentMF = *MBB.getParent();
+  if (ParentMF.getFunction().hasFnAttribute(Attribute::Cold)) {
+    ++NumPGOAllowedCold;
+    return true;
+  }
+
+  // A real profile count, when one exists, settles the question.
+  if (const BasicBlock *IRBlock = MBB.getBasicBlock(); IRBlock && PSI && BFI) {
+    if (auto ProfileCount = BFI->getBlockProfileCount(IRBlock))
+      return *ProfileCount <= PSI->getOrCompColdCountThreshold();
+  }
+
+  if (IsOptimistic) {
+    const TargetInstrInfo *TII = ParentMF.getSubtarget().getInstrInfo();
+    if (!TII->shouldOutlineFromFunctionByDefault(ParentMF))
+      return false;
+    // No profile data for this block; optimistically let the outliner see it.
+    ++NumPGOOptimisticOutlined;
+    return true;
+  }
+
+  assert(RunOutlinerMode == RunOutliner::ConservativePGO);
+  // No profile data for this block; conservatively keep it out of the outliner.
+  ++NumPGOConservativeBlockedOutlined;
+  return false;
+}
+
void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
// Build instruction mappings for each function in the module. Start by
// iterating over each Function in M.
LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n");
+ bool EnableProfileGuidedOutlining =
+ RunOutlinerMode == RunOutliner::OptimisticPGO ||
+ RunOutlinerMode == RunOutliner::ConservativePGO;
+ ProfileSummaryInfo *PSI = nullptr;
+ if (EnableProfileGuidedOutlining)
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
for (Function &F : M) {
LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n");
@@ -1225,7 +1277,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
}
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) {
+ BlockFrequencyInfo *BFI = nullptr;
+ if (EnableProfileGuidedOutlining && F.hasProfileData())
+ BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ if (RunOutlinerMode == RunOutliner::TargetDefault &&
+ !TII->shouldOutlineFromFunctionByDefault(*MF)) {
LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from "
"function by default\n");
continue;
@@ -1265,6 +1321,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
continue;
}
+ if (!allowPGOOutlining(RunOutlinerMode, PSI, BFI, MBB)) {
+ ++NumPGOBlockedOutlined;
+ continue;
+ }
+
// MBB is suitable for outlining. Map it to a list of unsigneds.
Mapper.convertToUnsignedVec(MBB, *TII);
}
@@ -1437,10 +1498,22 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
// the user how the outliner is running.
LLVM_DEBUG({
dbgs() << "Machine Outliner: Running on ";
- if (RunOnAllFunctions)
+ switch (RunOutlinerMode) {
+ case RunOutliner::AlwaysOutline:
dbgs() << "all functions";
- else
+ break;
+ case RunOutliner::OptimisticPGO:
+ dbgs() << "optimistically cold functions";
+ break;
+ case RunOutliner::ConservativePGO:
+ dbgs() << "conservatively cold functions";
+ break;
+ case RunOutliner::TargetDefault:
dbgs() << "target-default functions";
+ break;
+ case RunOutliner::NeverOutline:
+ llvm_unreachable("should not outline");
+ }
dbgs() << "\n";
});
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index b7135251781a..abb3f3e61200 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
+/// Return the sole non-debug use operand of \p RegNo, or nullptr when the
+/// register does not have exactly one such operand.
+MachineOperand *MachineRegisterInfo::getOneNonDBGUse(Register RegNo) const {
+  auto Uses = use_nodbg_operands(RegNo);
+  if (!hasSingleElement(Uses))
+    return nullptr;
+  return &*Uses.begin();
+}
+
MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 96c9cde622b4..f54e2f264556 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -507,83 +507,86 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
});
break;
case Intrinsic::objc_autorelease:
- Changed |= lowerObjCCall(F, RTLIB::objc_autorelease);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_autorelease);
break;
case Intrinsic::objc_autoreleasePoolPop:
- Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPop);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPop);
break;
case Intrinsic::objc_autoreleasePoolPush:
- Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPush);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPush);
break;
case Intrinsic::objc_autoreleaseReturnValue:
- Changed |= lowerObjCCall(F, RTLIB::objc_autoreleaseReturnValue);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleaseReturnValue);
break;
case Intrinsic::objc_copyWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_copyWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_copyWeak);
break;
case Intrinsic::objc_destroyWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_destroyWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_destroyWeak);
break;
case Intrinsic::objc_initWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_initWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_initWeak);
break;
case Intrinsic::objc_loadWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_loadWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeak);
break;
case Intrinsic::objc_loadWeakRetained:
- Changed |= lowerObjCCall(F, RTLIB::objc_loadWeakRetained);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeakRetained);
break;
case Intrinsic::objc_moveWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_moveWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_moveWeak);
break;
case Intrinsic::objc_release:
- Changed |= lowerObjCCall(F, RTLIB::objc_release, true);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_release, true);
break;
case Intrinsic::objc_retain:
- Changed |= lowerObjCCall(F, RTLIB::objc_retain, true);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain, true);
break;
case Intrinsic::objc_retainAutorelease:
- Changed |= lowerObjCCall(F, RTLIB::objc_retainAutorelease);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainAutorelease);
break;
case Intrinsic::objc_retainAutoreleaseReturnValue:
- Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleaseReturnValue);
+ Changed |=
+ lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleaseReturnValue);
break;
case Intrinsic::objc_retainAutoreleasedReturnValue:
- Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleasedReturnValue);
+ Changed |=
+ lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleasedReturnValue);
break;
case Intrinsic::objc_claimAutoreleasedReturnValue:
- Changed |= lowerObjCCall(F, RTLIB::objc_claimAutoreleasedReturnValue);
+ Changed |=
+ lowerObjCCall(F, RTLIB::impl_objc_claimAutoreleasedReturnValue);
break;
case Intrinsic::objc_retainBlock:
- Changed |= lowerObjCCall(F, RTLIB::objc_retainBlock);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainBlock);
break;
case Intrinsic::objc_storeStrong:
- Changed |= lowerObjCCall(F, RTLIB::objc_storeStrong);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeStrong);
break;
case Intrinsic::objc_storeWeak:
- Changed |= lowerObjCCall(F, RTLIB::objc_storeWeak);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeWeak);
break;
case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
Changed |=
- lowerObjCCall(F, RTLIB::objc_unsafeClaimAutoreleasedReturnValue);
+ lowerObjCCall(F, RTLIB::impl_objc_unsafeClaimAutoreleasedReturnValue);
break;
case Intrinsic::objc_retainedObject:
- Changed |= lowerObjCCall(F, RTLIB::objc_retainedObject);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainedObject);
break;
case Intrinsic::objc_unretainedObject:
- Changed |= lowerObjCCall(F, RTLIB::objc_unretainedObject);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedObject);
break;
case Intrinsic::objc_unretainedPointer:
- Changed |= lowerObjCCall(F, RTLIB::objc_unretainedPointer);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedPointer);
break;
case Intrinsic::objc_retain_autorelease:
- Changed |= lowerObjCCall(F, RTLIB::objc_retain_autorelease);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain_autorelease);
break;
case Intrinsic::objc_sync_enter:
- Changed |= lowerObjCCall(F, RTLIB::objc_sync_enter);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_enter);
break;
case Intrinsic::objc_sync_exit:
- Changed |= lowerObjCCall(F, RTLIB::objc_sync_exit);
+ Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_exit);
break;
case Intrinsic::exp:
case Intrinsic::exp2:
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 415674231b5c..a589ef761dd7 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -275,7 +275,6 @@ void ReachingDefAnalysis::printAllReachingDefs(MachineFunction &MF) {
bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
- TRI = MF->getSubtarget().getRegisterInfo();
const TargetSubtargetInfo &STI = MF->getSubtarget();
TRI = STI.getRegisterInfo();
TII = STI.getInstrInfo();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 27b5a0d37b67..d130efe96b56 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4710,7 +4710,10 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) {
if (SDValue LogBase2 = BuildLogBase2(N1, DL)) {
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
- return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc);
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap());
+ // TODO: Preserve setNoSignedWrap if LogBase2 isn't BitWidth - 1.
+ return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc, Flags);
}
}
@@ -9998,13 +10001,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
}
- // fold (not (neg x)) -> (add X, -1)
- // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
- // Y is a constant or the subtract has a single use.
- if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
- isNullConstant(N0.getOperand(0))) {
- return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
- DAG.getAllOnesConstant(DL, VT));
+ // fold (not (sub Y, X)) -> (add X, ~Y) if Y is a constant
+ if (N0.getOpcode() == ISD::SUB && isAllOnesConstant(N1)) {
+ SDValue Y = N0.getOperand(0);
+ SDValue X = N0.getOperand(1);
+
+ if (auto *YConst = dyn_cast<ConstantSDNode>(Y)) {
+ APInt NotYValue = ~YConst->getAPIntValue();
+ SDValue NotY = DAG.getConstant(NotYValue, DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT, X, NotY, N->getFlags());
+ }
}
// fold (not (add X, -1)) -> (neg X)
@@ -11089,38 +11095,43 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
}
}
- // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
- // (and (srl x, (sub c2, c1), MASK)
- if (N0.getOpcode() == ISD::SHL &&
- (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
- TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
- auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
- ConstantSDNode *RHS) {
- const APInt &LHSC = LHS->getAPIntValue();
- const APInt &RHSC = RHS->getAPIntValue();
- return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
- LHSC.getZExtValue() <= RHSC.getZExtValue();
- };
- if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
- /*AllowUndefs*/ false,
- /*AllowTypeMismatch*/ true)) {
- SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
- SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
- SDValue Mask = DAG.getAllOnesConstant(DL, VT);
- Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
- Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
- SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
- return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
- }
- if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
- /*AllowUndefs*/ false,
- /*AllowTypeMismatch*/ true)) {
- SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
- SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
- SDValue Mask = DAG.getAllOnesConstant(DL, VT);
- Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
- SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
- return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ if (N0.getOpcode() == ISD::SHL) {
+ // fold (srl (shl nuw x, c), c) -> x
+ if (N0.getOperand(1) == N1 && N0->getFlags().hasNoUnsignedWrap())
+ return N0.getOperand(0);
+
+ // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
+ // (and (srl x, (sub c2, c1), MASK)
+ if ((N0.getOperand(1) == N1 || N0->hasOneUse()) &&
+ TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
+ auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ const APInt &LHSC = LHS->getAPIntValue();
+ const APInt &RHSC = RHS->getAPIntValue();
+ return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
+ LHSC.getZExtValue() <= RHSC.getZExtValue();
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
+ Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
+ if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
}
}
@@ -15137,7 +15148,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return foldedExt;
} else if (ISD::isNON_EXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
bool DoXform = true;
SmallVector<SDNode *, 4> SetCCs;
if (!N0.hasOneUse())
@@ -16309,7 +16320,15 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
- return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+ SDNodeFlags Flags;
+ // Propagate nuw for sub.
+ if (N0->getOpcode() == ISD::SUB && N0->getFlags().hasNoUnsignedWrap() &&
+ DAG.MaskedValueIsZero(
+ N0->getOperand(0),
+ APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits())))
+ Flags.setNoUnsignedWrap(true);
+ return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR, Flags);
}
}
break;
@@ -16788,6 +16807,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// If we have frozen and unfrozen users of N0, update so everything uses N.
if (!N0.isUndef() && !N0.hasOneUse()) {
SDValue FrozenN0(N, 0);
+ // Unfreeze all uses of N to avoid double deleting N from the CSE map.
+ DAG.ReplaceAllUsesOfValueWith(FrozenN0, N0);
DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0);
// ReplaceAllUsesOfValueWith will have also updated the use in N, thus
// creating a cycle in a DAG. Let's undo that by mutating the freeze.
@@ -19346,13 +19367,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
// MachineBasicBlock CFG, which is awkward.
// fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
- // on the target.
+ // on the target, also copy fast math flags.
if (N1.getOpcode() == ISD::SETCC &&
TLI.isOperationLegalOrCustom(ISD::BR_CC,
N1.getOperand(0).getValueType())) {
- return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
- Chain, N1.getOperand(2),
- N1.getOperand(0), N1.getOperand(1), N2);
+ return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain,
+ N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2,
+ N1->getFlags());
}
if (N1.hasOneUse()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 1a63518ab37a..861f76e93f2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -238,7 +238,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
// Create the result registers for this node and add the result regs to
// the machine instruction.
- if (VRBase == 0) {
+ if (!VRBase) {
assert(RC && "Isn't a register operand!");
VRBase = MRI->createVirtualRegister(RC);
MIB.addReg(VRBase, RegState::Define);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 90d62e6da8e9..9e85f08abb76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -324,6 +324,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_VP_REDUCE(N);
break;
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ Res = PromoteIntRes_LOOP_DEPENDENCE_MASK(N);
+ break;
+
case ISD::FREEZE:
Res = PromoteIntRes_FREEZE(N);
break;
@@ -374,6 +379,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
return GetPromotedInteger(Op);
}
+/// Promote the result of a LOOP_DEPENDENCE_{WAR,RAW}_MASK node by re-emitting
+/// the same opcode, with unchanged operands, on the type the target transforms
+/// the original result type to.
+SDValue DAGTypeLegalizer::PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  EVT PromotedVT =
+      TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(N->getOpcode(), SDLoc(N), PromotedVT, N->ops());
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
// Sign-extend the new bits, and continue the assertion.
SDValue Op = SExtPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 65fd863e55ac..586c3411791f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -382,6 +382,7 @@ private:
SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N);
SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N);
+ SDValue PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -436,6 +437,7 @@ private:
SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N);
SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N);
+ SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo);
void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -868,6 +870,7 @@ private:
// Vector Result Scalarization: <1 x ty> -> ty.
void ScalarizeVectorResult(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N);
SDValue ScalarizeVecRes_BinOp(SDNode *N);
SDValue ScalarizeVecRes_CMP(SDNode *N);
SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
@@ -964,6 +967,7 @@ private:
void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -1070,6 +1074,7 @@ private:
SDValue WidenVecRes_ADDRSPACECAST(SDNode *N);
SDValue WidenVecRes_AssertZext(SDNode* N);
SDValue WidenVecRes_BITCAST(SDNode* N);
+ SDValue WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N);
SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2ca98958fde0..8e423c4f83b3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -138,6 +138,7 @@ class VectorLegalizer {
SDValue ExpandVP_FNEG(SDNode *Node);
SDValue ExpandVP_FABS(SDNode *Node);
SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
+ SDValue ExpandLOOP_DEPENDENCE_MASK(SDNode *N);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -475,6 +476,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECTOR_COMPRESS:
case ISD::SCMP:
case ISD::UCMP:
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -1291,6 +1294,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::UCMP:
Results.push_back(TLI.expandCMP(Node, DAG));
return;
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ Results.push_back(ExpandLOOP_DEPENDENCE_MASK(Node));
+ return;
case ISD::FADD:
case ISD::FMUL:
@@ -1796,6 +1803,50 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
}
+/// Expand a LOOP_DEPENDENCE_{WAR,RAW}_MASK node into generic vector operations.
+///
+/// Operand 0 is the source pointer, operand 1 the sink pointer and operand 2
+/// the element size. With Diff = (Sink - Source) / EltSize (passed through ABS
+/// first for the RAW form), the produced mask is
+///   setcc(step_vector, splat(Diff), ult)  OR  splat(Cmp)
+/// where Cmp is `Diff == 0` for RAW and `Diff <= 0` for WAR.
+///
+/// \returns the mask value of type N->getValueType(0).
+SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) {
+  SDLoc DL(N);
+  SDValue SourceValue = N->getOperand(0);
+  SDValue SinkValue = N->getOperand(1);
+  SDValue EltSize = N->getOperand(2);
+
+  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
+  EVT VT = N->getValueType(0);
+  EVT PtrVT = SourceValue->getValueType(0);
+
+  // Number of whole elements separating the two pointers.
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
+  if (IsReadAfterWrite)
+    Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff);
+
+  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize);
+
+  // If the difference is positive then some elements may alias
+  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                     Diff.getValueType());
+  // The zero is an ordinary operand of a generic SETCC, so it has to be a
+  // regular constant: a target constant is never materialised or folded.
+  SDValue Zero = DAG.getConstant(0, DL, PtrVT);
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero,
+                             IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE);
+
+  // Create the lane mask
+  EVT SplatVT = VT.changeElementType(PtrVT);
+  SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff);
+  SDValue VectorStep = DAG.getStepVector(DL, SplatVT);
+  EVT MaskVT = VT.changeElementType(MVT::i1);
+  SDValue DiffMask =
+      DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT);
+
+  EVT EltVT = VT.getVectorElementType();
+  // Extend the diff setcc in case the intrinsic has been promoted to a vector
+  // type with elements larger than i1
+  if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits())
+    DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask);
+
+  // Splat the compare result then OR it with the lane mask
+  if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits())
+    Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp);
+  SDValue Splat = DAG.getSplat(VT, DL, Cmp);
+  return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 10e3a5149a5d..118fd8418f78 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -53,6 +53,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
report_fatal_error("Do not know how to scalarize the result of this "
"operator!\n");
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ R = ScalarizeVecRes_LOOP_DEPENDENCE_MASK(N);
+ break;
case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
@@ -396,6 +400,22 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
return GetScalarizedVector(Op);
}
+/// Scalarize a one-element LOOP_DEPENDENCE_{WAR,RAW}_MASK result.
+///
+/// Computes the value of lane 0 using the same formula as the vector expansion
+/// in VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK:
+///   Diff = (Sink - Source) / EltSize   (through ABS first for the RAW form)
+///   lane0 = (Diff == 0 [RAW] | Diff <= 0 [WAR])  OR  (0 ult Diff)
+///
+/// Operand 0 is the source pointer, operand 1 the sink pointer and operand 2
+/// the element size.
+///
+/// \returns a scalar of the target's setcc result type for the pointer type.
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  SDValue SourceValue = N->getOperand(0);
+  SDValue SinkValue = N->getOperand(1);
+  SDValue EltSize = N->getOperand(2);
+  EVT PtrVT = SourceValue->getValueType(0);
+  SDLoc DL(N);
+
+  // The RAW form is symmetric in the two pointers, hence the ABS.
+  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
+
+  // Number of whole elements separating the two pointers.
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
+  if (IsReadAfterWrite)
+    Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff);
+  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize);
+
+  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                     Diff.getValueType());
+  // A regular constant: the zero is an ordinary operand of generic SETCC nodes
+  // and may need to be materialised or folded.
+  SDValue Zero = DAG.getConstant(0, DL, PtrVT);
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero,
+                             IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE);
+  // Lane index 0 compared against the element distance, as in the step-vector
+  // compare of the vector expansion.
+  SDValue LaneActive = DAG.getSetCC(DL, CmpVT, Zero, Diff, ISD::SETULT);
+  return DAG.getNode(ISD::OR, DL, CmpVT, Cmp, LaneActive);
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
SDValue Op = N->getOperand(0);
if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector)
@@ -1159,6 +1179,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
report_fatal_error("Do not know how to split the result of this "
"operator!\n");
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ SplitVecRes_LOOP_DEPENDENCE_MASK(N, Lo, Hi);
+ break;
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break;
case ISD::VSELECT:
@@ -1652,6 +1676,25 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
}
+/// Split a LOOP_DEPENDENCE_{WAR,RAW}_MASK result into two halves.
+///
+/// \p Lo is the same node on the low result type with the original operands.
+/// \p Hi is the same node on the high result type with operand 0 advanced by
+/// EltSize * HiVT.getVectorMinNumElements() bytes (scaled by vscale when the
+/// high type is scalable). Operand 2, the element size, is read as a constant.
+void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
+                                                        SDValue &Hi) {
+  SDLoc DL(N);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  SDValue PtrA = N->getOperand(0);
+  SDValue PtrB = N->getOperand(1);
+  SDValue EltSizeOp = N->getOperand(2);
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, EltSizeOp);
+
+  // Do the pointer arithmetic in the operand's own type rather than assuming
+  // i64, so the ADD stays well-typed when pointers are not 64 bits wide.
+  EVT PtrVT = PtrA.getValueType();
+  uint64_t Offset =
+      N->getConstantOperandVal(2) * HiVT.getVectorMinNumElements();
+  SDValue Addend =
+      HiVT.isScalableVT()
+          ? DAG.getVScale(DL, PtrVT,
+                          APInt(PtrVT.getScalarSizeInBits(), Offset))
+          : DAG.getConstant(Offset, DL, PtrVT);
+
+  PtrA = DAG.getNode(ISD::ADD, DL, PtrVT, PtrA, Addend);
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, EltSizeOp);
+}
+
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
@@ -2517,10 +2560,10 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl);
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- N->getPointerInfo(), MachineMemOperand::MOLoad,
- LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
- N->getRanges());
+ N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ Alignment, N->getAAInfo(), N->getRanges());
if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
SDValue PassThru = MGT->getPassThru();
@@ -4321,10 +4364,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL);
SDValue Lo;
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- N->getPointerInfo(), MachineMemOperand::MOStore,
- LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
- N->getRanges());
+ N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ Alignment, N->getAAInfo(), N->getRanges());
if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
@@ -4784,6 +4827,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
#endif
report_fatal_error("Do not know how to widen the result of this operator!");
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ Res = WidenVecRes_LOOP_DEPENDENCE_MASK(N);
+ break;
case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::ADDRSPACECAST:
Res = WidenVecRes_ADDRSPACECAST(N);
@@ -5986,6 +6033,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
return CreateStackStoreLoad(InOp, WidenVT);
}
+SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { // Widens the result of a LOOP_DEPENDENCE_{RAW,WAR}_MASK node.
+ return DAG.getNode(
+ N->getOpcode(), SDLoc(N), // Re-emit the same opcode at N's location.
+ TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), // Result type the target maps N's result type to.
+ N->getOperand(0), N->getOperand(1), N->getOperand(2)); // All three operands are forwarded unchanged.
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
SDLoc dl(N);
// Build a vector with undefined for the new nodes.
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 0a449fd011e6..72ea0898f975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -63,6 +63,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
HorizontalVerticalBalance = 0;
}
+ResourcePriorityQueue::~ResourcePriorityQueue() = default; // Defaulted, but defined out of line here. NOTE(review): presumably so members are destroyed where their types are complete -- confirm against the header.
+
unsigned
ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
unsigned NumberDeps = 0;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3672a91e33a3..bcf25958d098 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3299,7 +3299,7 @@ SelectionDAG::getValidShiftAmountRange(SDValue V, const APInt &DemandedElts,
return std::nullopt;
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts,
unsigned Depth) const {
assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3312,7 +3312,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts,
return std::nullopt;
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const {
EVT VT = V.getValueType();
APInt DemandedElts = VT.isFixedLengthVector()
@@ -3321,7 +3321,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const {
return getValidShiftAmount(V, DemandedElts, Depth);
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts,
unsigned Depth) const {
assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3333,7 +3333,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts,
return std::nullopt;
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const {
EVT VT = V.getValueType();
APInt DemandedElts = VT.isFixedLengthVector()
@@ -3342,7 +3342,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const {
return getValidMinimumShiftAmount(V, DemandedElts, Depth);
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts,
unsigned Depth) const {
assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3354,7 +3354,7 @@ SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts,
return std::nullopt;
}
-std::optional<uint64_t>
+std::optional<unsigned>
SelectionDAG::getValidMaximumShiftAmount(SDValue V, unsigned Depth) const {
EVT VT = V.getValueType();
APInt DemandedElts = VT.isFixedLengthVector()
@@ -3828,7 +3828,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::shl(Known, Known2, NUW, NSW, ShAmtNonZero);
// Minimum shift low bits are known zero.
- if (std::optional<uint64_t> ShMinAmt =
+ if (std::optional<unsigned> ShMinAmt =
getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
Known.Zero.setLowBits(*ShMinAmt);
break;
@@ -3840,7 +3840,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Op->getFlags().hasExact());
// Minimum shift high bits are known zero.
- if (std::optional<uint64_t> ShMinAmt =
+ if (std::optional<unsigned> ShMinAmt =
getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
Known.Zero.setHighBits(*ShMinAmt);
break;
@@ -3850,6 +3850,22 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false,
Op->getFlags().hasExact());
break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ if (ConstantSDNode *C =
+ isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
+ unsigned Amt = C->getAPIntValue().urem(BitWidth);
+
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // Canonicalize to ROTR.
+ if (Opcode == ISD::ROTL && Amt != 0)
+ Amt = BitWidth - Amt;
+
+ Known.Zero = Known.Zero.rotr(Amt);
+ Known.One = Known.One.rotr(Amt);
+ }
+ break;
case ISD::FSHL:
case ISD::FSHR:
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) {
@@ -3868,15 +3884,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
if (Opcode == ISD::FSHL) {
- Known.One <<= Amt;
- Known.Zero <<= Amt;
- Known2.One.lshrInPlace(BitWidth - Amt);
- Known2.Zero.lshrInPlace(BitWidth - Amt);
+ Known <<= Amt;
+ Known2 >>= BitWidth - Amt;
} else {
- Known.One <<= BitWidth - Amt;
- Known.Zero <<= BitWidth - Amt;
- Known2.One.lshrInPlace(Amt);
- Known2.Zero.lshrInPlace(Amt);
+ Known <<= BitWidth - Amt;
+ Known2 >>= Amt;
}
Known = Known.unionWith(Known2);
}
@@ -4875,15 +4887,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::SRA:
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
// SRA X, C -> adds C sign bits.
- if (std::optional<uint64_t> ShAmt =
+ if (std::optional<unsigned> ShAmt =
getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
- Tmp = std::min<uint64_t>(Tmp + *ShAmt, VTBits);
+ Tmp = std::min(Tmp + *ShAmt, VTBits);
return Tmp;
case ISD::SHL:
if (std::optional<ConstantRange> ShAmtRange =
getValidShiftAmountRange(Op, DemandedElts, Depth + 1)) {
- uint64_t MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue();
- uint64_t MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue();
+ unsigned MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue();
+ unsigned MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue();
// Try to look through ZERO/SIGN/ANY_EXTEND. If all extended bits are
// shifted out, then we can compute the number of sign bits for the
// operand being extended. A future improvement could be to pass along the
@@ -4894,7 +4906,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
EVT ExtVT = Ext.getValueType();
SDValue Extendee = Ext.getOperand(0);
EVT ExtendeeVT = Extendee.getValueType();
- uint64_t SizeDifference =
+ unsigned SizeDifference =
ExtVT.getScalarSizeInBits() - ExtendeeVT.getScalarSizeInBits();
if (SizeDifference <= MinShAmt) {
Tmp = SizeDifference +
@@ -5127,7 +5139,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// If the sign portion ends in our element the subtraction gives correct
// result. Otherwise it gives either negative or > bitwidth result
- return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth);
+ return std::clamp(KnownSign - rIndex * BitWidth, 1, BitWidth);
}
case ISD::INSERT_VECTOR_ELT: {
if (VT.isScalableVector())
@@ -5660,6 +5672,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::USUBSAT:
case ISD::MULHU:
case ISD::MULHS:
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU:
case ISD::ABDU:
case ISD::ABDS:
case ISD::SMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 62ba801f6992..430e47451fd4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7974,12 +7974,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
case Intrinsic::amdgcn_call_whole_wave: {
TargetLowering::ArgListTy Args;
+ bool isTailCall = I.isTailCall();
// The first argument is the callee. Skip it when assembling the call args.
for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)),
I.getArgOperand(Idx)->getType());
Arg.setAttributes(&I, Idx);
+
+ // If we have an explicit sret argument that is an Instruction (i.e., it
+ // might point to function-local memory), we can't meaningfully tail-call.
+ if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx)))
+ isTailCall = false;
+
Args.push_back(Arg);
}
@@ -7994,7 +8001,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
.setChain(getRoot())
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
getValue(I.getArgOperand(0)), std::move(Args))
- .setTailCall(false)
+ .setTailCall(isTailCall && canTailCall(I))
.setIsPreallocated(
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
.setConvergent(I.isConvergent())
@@ -8295,6 +8302,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
visitVectorExtractLastActive(I, Intrinsic);
return;
}
+ case Intrinsic::loop_dependence_war_mask:
+ setValue(&I,
+ DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl,
+ EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
+ getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+ return;
+ case Intrinsic::loop_dependence_raw_mask:
+ setValue(&I,
+ DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl,
+ EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
+ getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+ return;
}
}
@@ -8456,8 +8475,11 @@ void SelectionDAGBuilder::visitVPLoad(
MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML);
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ MachinePointerInfo(PtrOperand), MMOFlags,
LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
MMO, false /*IsExpanding */);
@@ -8508,9 +8530,11 @@ void SelectionDAGBuilder::visitVPGather(
Alignment = DAG.getEVTAlign(VT.getScalarType());
unsigned AS =
PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(AS), MachineMemOperand::MOLoad,
- LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
+ MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ *Alignment, AAInfo, Ranges);
SDValue Base, Index, Scale;
bool UniformBase =
getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
@@ -8546,8 +8570,11 @@ void SelectionDAGBuilder::visitVPStore(
Alignment = DAG.getEVTAlign(VT);
SDValue Ptr = OpValues[1];
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ MachinePointerInfo(PtrOperand), MMOFlags,
LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
@@ -8569,9 +8596,11 @@ void SelectionDAGBuilder::visitVPScatter(
Alignment = DAG.getEVTAlign(VT.getScalarType());
unsigned AS =
PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(AS), MachineMemOperand::MOStore,
- LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
+ MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ *Alignment, AAInfo);
SDValue Base, Index, Scale;
bool UniformBase =
getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
@@ -8609,9 +8638,12 @@ void SelectionDAGBuilder::visitVPStridedLoad(
bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML);
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(AS), MachineMemOperand::MOLoad,
- LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
+ MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ *Alignment, AAInfo, Ranges);
SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
OpValues[2], OpValues[3], MMO,
@@ -8632,9 +8664,12 @@ void SelectionDAGBuilder::visitVPStridedStore(
Alignment = DAG.getEVTAlign(VT.getScalarType());
AAMDNodes AAInfo = VPIntrin.getAAMetadata();
unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(AS), MachineMemOperand::MOStore,
- LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
+ MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+ *Alignment, AAInfo);
SDValue ST = DAG.getStridedStoreVP(
getMemoryRoot(), DL, OpValues[0], OpValues[1],
@@ -8901,6 +8936,29 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
return Result;
}
+bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Returns true only if none of the checks below rules out a tail call for CB.
+ bool isMustTailCall = CB.isMustTailCall();
+
+ // Avoid emitting tail calls in functions with the disable-tail-calls
+ // attribute. musttail calls are exempt from this particular check.
+ const Function *Caller = CB.getParent()->getParent();
+ if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
+ "true" &&
+ !isMustTailCall)
+ return false;
+
+ // We can't tail call inside a function with a swifterror argument. Lowering
+ // does not support this yet. It would have to move into the swifterror
+ // register before the call.
+ if (DAG.getTargetLoweringInfo().supportSwiftError() &&
+ Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false; // No musttail exemption here, unlike the attribute check above.
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within TLI->LowerCallTo.
+ return isInTailCallPosition(CB, DAG.getTarget());
+}
+
void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
bool isTailCall, bool isMustTailCall,
const BasicBlock *EHPadBB,
@@ -8915,21 +8973,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
const Value *SwiftErrorVal = nullptr;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (isTailCall) {
- // Avoid emitting tail calls in functions with the disable-tail-calls
- // attribute.
- auto *Caller = CB.getParent()->getParent();
- if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
- "true" && !isMustTailCall)
- isTailCall = false;
-
- // We can't tail call inside a function with a swifterror argument. Lowering
- // does not support this yet. It would have to move into the swifterror
- // register before the call.
- if (TLI.supportSwiftError() &&
- Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
- isTailCall = false;
- }
+ if (isTailCall)
+ isTailCall = canTailCall(CB);
for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
const Value *V = *I;
@@ -8969,11 +9014,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
Args.push_back(Entry);
}
- // Check if target-independent constraints permit a tail call here.
- // Target-dependent constraints are checked within TLI->LowerCallTo.
- if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget()))
- isTailCall = false;
-
// Disable tail calls if there is an swifterror argument. Targets have not
// been updated to support tail calls.
if (TLI.supportSwiftError() && SwiftErrorVal)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index e0835e631035..c7577fa335fe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -408,6 +408,10 @@ public:
bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr,
const TargetLowering::PtrAuthInfo *PAI = nullptr);
+ // Check some of the target-independent constraints for tail calls. This does
+ // not iterate over the call arguments.
+ bool canTailCall(const CallBase &CB) const; // Returns false if one of those checks rules out a tail call for CB.
+
// Lower range metadata from 0 to N to assert zext to an integer of nearest
// floor power of two.
SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 900da7645504..4b2a00c2e2cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -587,6 +587,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
return "partial_reduce_smla";
case ISD::PARTIAL_REDUCE_SUMLA:
return "partial_reduce_sumla";
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ return "loop_dep_war";
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ return "loop_dep_raw";
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index ece50ed95fc4..e61558c59bf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1729,10 +1729,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Setup an EH landing-pad block.
FuncInfo->ExceptionPointerVirtReg = Register();
FuncInfo->ExceptionSelectorVirtReg = Register();
- if (LLVMBB->isEHPad())
+ if (LLVMBB->isEHPad()) {
if (!PrepareEHLandingPad())
continue;
+ if (!FastIS) {
+ SDValue NewRoot = TLI->lowerEHPadEntry(CurDAG->getRoot(),
+ SDB->getCurSDLoc(), *CurDAG);
+ if (NewRoot && NewRoot != CurDAG->getRoot())
+ CurDAG->setRoot(NewRoot);
+ }
+ }
+
// Before doing SelectionDAG ISel, see if FastISel has been requested.
if (FastIS) {
if (LLVMBB != &Fn.getEntryBlock())
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 402a012e8e55..fd6d20e146bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -832,7 +832,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::SHL: {
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<uint64_t> MaxSA =
+ if (std::optional<unsigned> MaxSA =
DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = *MaxSA;
@@ -847,7 +847,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::SRL: {
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<uint64_t> MaxSA =
+ if (std::optional<unsigned> MaxSA =
DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = *MaxSA;
@@ -1780,7 +1780,7 @@ bool TargetLowering::SimplifyDemandedBits(
SDValue Op1 = Op.getOperand(1);
EVT ShiftVT = Op1.getValueType();
- if (std::optional<uint64_t> KnownSA =
+ if (std::optional<unsigned> KnownSA =
TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
unsigned ShAmt = *KnownSA;
if (ShAmt == 0)
@@ -1792,7 +1792,7 @@ bool TargetLowering::SimplifyDemandedBits(
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SRL) {
if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
- if (std::optional<uint64_t> InnerSA =
+ if (std::optional<unsigned> InnerSA =
TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
unsigned C1 = *InnerSA;
unsigned Opc = ISD::SHL;
@@ -1832,7 +1832,7 @@ bool TargetLowering::SimplifyDemandedBits(
// TODO - support non-uniform vector amounts.
if (InnerOp.getOpcode() == ISD::SRL && Op0.hasOneUse() &&
InnerOp.hasOneUse()) {
- if (std::optional<uint64_t> SA2 = TLO.DAG.getValidShiftAmount(
+ if (std::optional<unsigned> SA2 = TLO.DAG.getValidShiftAmount(
InnerOp, DemandedElts, Depth + 2)) {
unsigned InnerShAmt = *SA2;
if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
@@ -1858,8 +1858,7 @@ bool TargetLowering::SimplifyDemandedBits(
Op->dropFlags(SDNodeFlags::NoWrap);
return true;
}
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// low bits known zero.
Known.Zero.setLowBits(ShAmt);
@@ -1950,7 +1949,7 @@ bool TargetLowering::SimplifyDemandedBits(
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<uint64_t> MaxSA =
+ if (std::optional<unsigned> MaxSA =
TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
unsigned ShAmt = *MaxSA;
unsigned NumSignBits =
@@ -1966,7 +1965,7 @@ bool TargetLowering::SimplifyDemandedBits(
SDValue Op1 = Op.getOperand(1);
EVT ShiftVT = Op1.getValueType();
- if (std::optional<uint64_t> KnownSA =
+ if (std::optional<unsigned> KnownSA =
TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
unsigned ShAmt = *KnownSA;
if (ShAmt == 0)
@@ -1978,7 +1977,7 @@ bool TargetLowering::SimplifyDemandedBits(
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SHL) {
if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
- if (std::optional<uint64_t> InnerSA =
+ if (std::optional<unsigned> InnerSA =
TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
unsigned C1 = *InnerSA;
unsigned Opc = ISD::SRL;
@@ -1998,7 +1997,7 @@ bool TargetLowering::SimplifyDemandedBits(
// single sra. We can do this if the top bits are never demanded.
if (Op0.getOpcode() == ISD::SRA && Op0.hasOneUse()) {
if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
- if (std::optional<uint64_t> InnerSA =
+ if (std::optional<unsigned> InnerSA =
TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
unsigned C1 = *InnerSA;
// Clamp the combined shift amount if it exceeds the bit width.
@@ -2042,8 +2041,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
@@ -2064,7 +2062,7 @@ bool TargetLowering::SimplifyDemandedBits(
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<uint64_t> MaxSA =
+ if (std::optional<unsigned> MaxSA =
TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
unsigned ShAmt = *MaxSA;
// Must already be signbits in DemandedBits bounds, and can't demand any
@@ -2103,7 +2101,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isOne())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
- if (std::optional<uint64_t> KnownSA =
+ if (std::optional<unsigned> KnownSA =
TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
unsigned ShAmt = *KnownSA;
if (ShAmt == 0)
@@ -2112,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedBits(
// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target
// supports sext_inreg.
if (Op0.getOpcode() == ISD::SHL) {
- if (std::optional<uint64_t> InnerSA =
+ if (std::optional<unsigned> InnerSA =
TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
unsigned LowBits = BitWidth - ShAmt;
EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits);
@@ -2153,8 +2151,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
@@ -2225,10 +2222,8 @@ bool TargetLowering::SimplifyDemandedBits(
Depth + 1))
return true;
- Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt));
- Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt));
- Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
- Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
+ Known2 <<= (IsFSHL ? Amt : (BitWidth - Amt));
+ Known >>= (IsFSHL ? (BitWidth - Amt) : Amt);
Known = Known.unionWith(Known2);
// Attempt to avoid multi-use ops if we don't need anything from them.
@@ -2363,8 +2358,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
Depth + 1))
return true;
- Known.One = Known2.One.reverseBits();
- Known.Zero = Known2.Zero.reverseBits();
+ Known = Known2.reverseBits();
break;
}
case ISD::BSWAP: {
@@ -2397,8 +2391,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
Depth + 1))
return true;
- Known.One = Known2.One.byteSwap();
- Known.Zero = Known2.Zero.byteSwap();
+ Known = Known2.byteSwap();
break;
}
case ISD::CTPOP: {
@@ -2664,11 +2657,11 @@ bool TargetLowering::SimplifyDemandedBits(
break;
}
- std::optional<uint64_t> ShAmtC =
+ std::optional<unsigned> ShAmtC =
TLO.DAG.getValidShiftAmount(Src, DemandedElts, Depth + 2);
if (!ShAmtC || *ShAmtC >= BitWidth)
break;
- uint64_t ShVal = *ShAmtC;
+ unsigned ShVal = *ShAmtC;
APInt HighBits =
APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
@@ -3234,27 +3227,6 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
- SDValue ScalarSrc = Op.getOperand(0);
- if (ScalarSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- SDValue Src = ScalarSrc.getOperand(0);
- SDValue Idx = ScalarSrc.getOperand(1);
- EVT SrcVT = Src.getValueType();
-
- ElementCount SrcEltCnt = SrcVT.getVectorElementCount();
-
- if (SrcEltCnt.isScalable())
- return false;
-
- unsigned NumSrcElts = SrcEltCnt.getFixedValue();
- if (isNullConstant(Idx)) {
- APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0);
- APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts);
- APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts);
- if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
- TLO, Depth + 1))
- return true;
- }
- }
KnownUndef.setHighBits(NumElts - 1);
break;
}
@@ -9740,8 +9712,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- SDValue LHS = DAG.getFreeze(N->getOperand(0));
- SDValue RHS = DAG.getFreeze(N->getOperand(1));
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
bool IsSigned = N->getOpcode() == ISD::ABDS;
// abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
@@ -9749,34 +9721,37 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) {
+ LHS = DAG.getFreeze(LHS);
+ RHS = DAG.getFreeze(RHS);
SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
}
// abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs))
- if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT))
+ if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) {
+ LHS = DAG.getFreeze(LHS);
+ RHS = DAG.getFreeze(RHS);
return DAG.getNode(ISD::OR, dl, VT,
DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
+ }
// If the subtract doesn't overflow then just use abs(sub())
- // NOTE: don't use frozen operands for value tracking.
- bool IsNonNegative = DAG.SignBitIsZero(N->getOperand(1)) &&
- DAG.SignBitIsZero(N->getOperand(0));
+ bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS);
- if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(0),
- N->getOperand(1)))
+ if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS))
return DAG.getNode(ISD::ABS, dl, VT,
DAG.getNode(ISD::SUB, dl, VT, LHS, RHS));
- if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(1),
- N->getOperand(0)))
+ if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS))
return DAG.getNode(ISD::ABS, dl, VT,
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
+ LHS = DAG.getFreeze(LHS);
+ RHS = DAG.getFreeze(RHS);
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
// Branchless expansion iff cmp result is allbits:
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 9e49dddd46ba..0d7b128fc736 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -996,7 +996,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
(MI->getOpcode() != CombineOpc && CombineOpc != 0))
return false;
// Must only used by the user we combine with.
- if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ if (!MRI.hasOneNonDBGUse(MO.getReg()))
return false;
return true;
@@ -1456,11 +1456,13 @@ void TargetInstrInfo::reassociateOps(
MIB1->clearFlag(MachineInstr::MIFlag::NoSWrap);
MIB1->clearFlag(MachineInstr::MIFlag::NoUWrap);
MIB1->clearFlag(MachineInstr::MIFlag::IsExact);
+ MIB1->clearFlag(MachineInstr::MIFlag::Disjoint);
MIB2->setFlags(IntersectedFlags);
MIB2->clearFlag(MachineInstr::MIFlag::NoSWrap);
MIB2->clearFlag(MachineInstr::MIFlag::NoUWrap);
MIB2->clearFlag(MachineInstr::MIFlag::IsExact);
+ MIB2->clearFlag(MachineInstr::MIFlag::Disjoint);
setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9ffced80b07f..c23281a820b2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -612,23 +612,23 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
RTLIB::LibcallImpl Impl) const {
switch (Impl) {
- case RTLIB::__aeabi_dcmpeq__une:
- case RTLIB::__aeabi_fcmpeq__une:
+ case RTLIB::impl___aeabi_dcmpeq__une:
+ case RTLIB::impl___aeabi_fcmpeq__une:
// Usage in the eq case, so we have to invert the comparison.
return ISD::SETEQ;
- case RTLIB::__aeabi_dcmpeq__oeq:
- case RTLIB::__aeabi_fcmpeq__oeq:
+ case RTLIB::impl___aeabi_dcmpeq__oeq:
+ case RTLIB::impl___aeabi_fcmpeq__oeq:
// Normal comparison to boolean value.
return ISD::SETNE;
- case RTLIB::__aeabi_dcmplt:
- case RTLIB::__aeabi_dcmple:
- case RTLIB::__aeabi_dcmpge:
- case RTLIB::__aeabi_dcmpgt:
- case RTLIB::__aeabi_dcmpun:
- case RTLIB::__aeabi_fcmplt:
- case RTLIB::__aeabi_fcmple:
- case RTLIB::__aeabi_fcmpge:
- case RTLIB::__aeabi_fcmpgt:
+ case RTLIB::impl___aeabi_dcmplt:
+ case RTLIB::impl___aeabi_dcmple:
+ case RTLIB::impl___aeabi_dcmpge:
+ case RTLIB::impl___aeabi_dcmpgt:
+ case RTLIB::impl___aeabi_dcmpun:
+ case RTLIB::impl___aeabi_fcmplt:
+ case RTLIB::impl___aeabi_fcmple:
+ case RTLIB::impl___aeabi_fcmpge:
+ case RTLIB::impl___aeabi_fcmpgt:
/// The AEABI versions return a typical boolean value, so we can compare
/// against the integer result as simply != 0.
return ISD::SETNE;
@@ -900,6 +900,9 @@ void TargetLoweringBase::initActions() {
// Masked vector extracts default to expand.
setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand);
+ setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Expand);
+ setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Expand);
+
// FP environment operations default to expand.
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
@@ -2406,6 +2409,34 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
return Flags;
}
+MachineMemOperand::Flags TargetLoweringBase::getVPIntrinsicMemOperandFlags(
+ const VPIntrinsic &VPIntrin) const { // Builds the memory-operand flags for a VP memory intrinsic.
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone; // Overwritten by every handled case below.
+ Intrinsic::ID IntrinID = VPIntrin.getIntrinsicID();
+
+ switch (IntrinID) { // The intrinsic ID alone decides between load and store.
+ default: // Anything not listed below is rejected rather than guessed at.
+ llvm_unreachable("unexpected intrinsic. Existing code may be appropriate "
+ "for it, but support must be explicitly enabled");
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_gather:
+ case Intrinsic::experimental_vp_strided_load:
+ Flags = MachineMemOperand::MOLoad; // All three reading forms map to MOLoad.
+ break;
+ case Intrinsic::vp_store:
+ case Intrinsic::vp_scatter:
+ case Intrinsic::experimental_vp_strided_store:
+ Flags = MachineMemOperand::MOStore; // All three writing forms map to MOStore.
+ break;
+ }
+
+ if (VPIntrin.hasMetadata(LLVMContext::MD_nontemporal)) // Nontemporal metadata on the call adds MONonTemporal.
+ Flags |= MachineMemOperand::MONonTemporal;
+
+ Flags |= getTargetMMOFlags(VPIntrin); // OR in whatever getTargetMMOFlags returns for this call.
+ return Flags;
+}
+
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index d19ef923ef74..ae681b9aebdf 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -247,6 +247,8 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
break;
case Triple::riscv32:
case Triple::riscv64:
+ case Triple::riscv32be:
+ case Triple::riscv64be:
LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
dwarf::DW_EH_PE_sdata4;
@@ -1918,6 +1920,13 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
}
emitCGProfileMetadata(Streamer, M);
+ emitPseudoProbeDescMetadata(Streamer, M, [](MCStreamer &Streamer) {
+ if (MCSymbol *Sym =
+ static_cast<MCSectionCOFF *>(Streamer.getCurrentSectionOnly())
+ ->getCOMDATSymbol())
+ if (Sym->isUndefined())
+ Streamer.emitLabel(Sym);
+ });
}
void TargetLoweringObjectFileCOFF::emitLinkerDirectives(
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 7d7c6e743fa7..b6169e6c4dc3 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -134,12 +134,18 @@ static cl::opt<cl::boolOrDefault> DebugifyCheckAndStripAll(
static cl::opt<RunOutliner> EnableMachineOutliner(
"enable-machine-outliner", cl::desc("Enable the machine outliner"),
cl::Hidden, cl::ValueOptional, cl::init(RunOutliner::TargetDefault),
- cl::values(clEnumValN(RunOutliner::AlwaysOutline, "always",
- "Run on all functions guaranteed to be beneficial"),
- clEnumValN(RunOutliner::NeverOutline, "never",
- "Disable all outlining"),
- // Sentinel value for unspecified option.
- clEnumValN(RunOutliner::AlwaysOutline, "", "")));
+ cl::values( // Accepted spellings; the final "" entry maps to AlwaysOutline.
+ clEnumValN(RunOutliner::AlwaysOutline, "always",
+ "Run on all functions guaranteed to be beneficial"),
+ clEnumValN(RunOutliner::OptimisticPGO, "optimistic-pgo",
+ "Outline cold code only. If a code block does not have "
+ "profile data, optimistically assume it is cold."),
+ clEnumValN(RunOutliner::ConservativePGO, "conservative-pgo",
+ "Outline cold code only. If a code block does not have "
+ "profile data, conservatively assume it is hot."),
+ clEnumValN(RunOutliner::NeverOutline, "never", "Disable all outlining"),
+ // Sentinel value for unspecified option.
+ clEnumValN(RunOutliner::AlwaysOutline, "", "")));
static cl::opt<bool> EnableGlobalMergeFunc(
"enable-global-merge-func", cl::Hidden,
cl::desc("Enable global merge functions that are based on hash function"));
@@ -1074,7 +1080,7 @@ bool TargetPassConfig::addISelPasses() {
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
addPass(createPreISelIntrinsicLoweringPass());
addPass(createExpandLargeDivRemPass());
- addPass(createExpandFpPass());
+ addPass(createExpandFpPass(getOptLevel()));
addIRPasses();
addCodeGenPrepare();
addPassesToHandleExceptions();
@@ -1224,12 +1230,9 @@ void TargetPassConfig::addMachinePasses() {
if (TM->Options.EnableMachineOutliner &&
getOptLevel() != CodeGenOptLevel::None &&
EnableMachineOutliner != RunOutliner::NeverOutline) {
- bool RunOnAllFunctions =
- (EnableMachineOutliner == RunOutliner::AlwaysOutline);
- bool AddOutliner =
- RunOnAllFunctions || TM->Options.SupportsDefaultOutlining;
- if (AddOutliner)
- addPass(createMachineOutlinerPass(RunOnAllFunctions));
+ if (EnableMachineOutliner != RunOutliner::TargetDefault ||
+ TM->Options.SupportsDefaultOutlining)
+ addPass(createMachineOutlinerPass(EnableMachineOutliner));
}
if (GCEmptyBlocks)