summaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/CodeMetrics.cpp53
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp20
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp11
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp1
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp1
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp86
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp5
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp2
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp5
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp16
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp24
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp11
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp10
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp4
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp32
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp2
-rw-r--r--llvm/lib/IR/AsmWriter.cpp3
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp4
-rw-r--r--llvm/lib/MC/WasmObjectWriter.cpp5
-rw-r--r--llvm/lib/Object/ELFObjectFile.cpp2
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp1
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp1
-rw-r--r--llvm/lib/Passes/PassRegistry.def1
-rw-r--r--llvm/lib/ProfileData/MemProfReader.cpp2
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp61
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td8
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td30
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp33
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedOryon.td1659
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp7
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp19
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/GCNProcessors.td6
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp103
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp103
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h3
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp22
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp12
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.h1
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.h7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp32
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.cpp57
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.h16
-rw-r--r--llvm/lib/Target/NVPTX/NVVMIntrRange.cpp197
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp16
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp11
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp16
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td73
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp47
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp149
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp12
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp31
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td2
-rw-r--r--llvm/lib/Target/X86/X86LowerAMXType.cpp16
-rw-r--r--llvm/lib/Target/X86/X86LowerTileCopy.cpp23
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.h12
-rw-r--r--llvm/lib/Target/X86/X86SchedIceLake.td18
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeServer.td4
-rw-r--r--llvm/lib/TargetParser/Host.cpp1
-rw-r--r--llvm/lib/TargetParser/TargetParser.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp17
-rw-r--r--llvm/lib/Transforms/IPO/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/IPO/ExpandVariadics.cpp1012
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp251
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp25
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp6
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp56
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp57
-rw-r--r--llvm/lib/Transforms/Utils/InlineFunction.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/LoopRotationUtils.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp47
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp17
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp3
100 files changed, 4032 insertions, 743 deletions
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 2637e2f97dbb..ea67b526423b 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
@@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues(
completeEphemeralValues(Visited, Worklist, EphValues);
}
+static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) {
+ if (!L)
+ return false;
+ if (!isa<ConvergenceControlInst>(I))
+ return false;
+ for (const auto *U : I.users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
/// Fill in the current structure with information gleaned from the specified
/// block.
void CodeMetrics::analyzeBasicBlock(
const BasicBlock *BB, const TargetTransformInfo &TTI,
- const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) {
+ const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO,
+ const Loop *L) {
++NumBlocks;
InstructionCost NumInstsBeforeThisBB = NumInsts;
for (const Instruction &I : *BB) {
@@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock(
if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
++NumVectorInsts;
- if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) &&
+ I.isUsedOutsideOfBlock(BB)) {
+ LLVM_DEBUG(dbgs() << I
+ << "\n Cannot duplicate a token value used outside "
+ "the current block (except convergence control).\n");
notDuplicatable = true;
-
- if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (CI->cannotDuplicate())
- notDuplicatable = true;
- if (CI->isConvergent())
- convergent = true;
}
- if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
- if (InvI->cannotDuplicate())
+ if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->cannotDuplicate())
notDuplicatable = true;
+ // Compute a meet over the visited blocks for the following partial order:
+ //
+ // None -> { Controlled, ExtendedLoop, Uncontrolled}
+ // Controlled -> ExtendedLoop
+ if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) {
+ if (isa<ConvergenceControlInst>(CB) ||
+ CB->getConvergenceControlToken()) {
+ assert(Convergence != ConvergenceKind::Uncontrolled);
+ LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n");
+ if (extendsConvergenceOutsideLoop(I, L))
+ Convergence = ConvergenceKind::ExtendedLoop;
+ else {
+ assert(Convergence != ConvergenceKind::ExtendedLoop);
+ Convergence = ConvergenceKind::Controlled;
+ }
+ } else {
+ assert(Convergence == ConvergenceKind::None);
+ Convergence = ConvergenceKind::Uncontrolled;
+ }
+ }
+ }
NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
}
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 369ab087ffc0..c34c4974382e 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default);
}
+CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) {
+ BasicBlock *H = TheLoop->getHeader();
+ for (Instruction &II : *H) {
+ if (auto *CB = dyn_cast<CallBase>(&II)) {
+ if (!CB->isConvergent())
+ continue;
+ // This is the heart if it uses a token defined outside the loop. The
+ // verifier has already checked that only the loop intrinsic can use such
+ // a token.
+ if (auto *Token = CB->getConvergenceControlToken()) {
+ auto *TokenDef = cast<Instruction>(Token);
+ if (!TheLoop->contains(TokenDef->getParent()))
+ return CB;
+ }
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
bool llvm::isFinite(const Loop *L) {
return L->getHeader()->getParent()->willReturn();
}
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 08138a5e2f2d..782c28c94483 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7296,10 +7296,13 @@ static bool isGuaranteedNotToBeUndefOrPoison(
isa<ConstantPointerNull>(C) || isa<Function>(C))
return true;
- if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C))
- return (!includesUndef(Kind) ? !C->containsPoisonElement()
- : !C->containsUndefOrPoisonElement()) &&
- !C->containsConstantExpression();
+ if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) {
+ if (includesUndef(Kind) && C->containsUndefElement())
+ return false;
+ if (includesPoison(Kind) && C->containsPoisonElement())
+ return false;
+ return !C->containsConstantExpression();
+ }
}
// Strip cast operations from a pointer value.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05..30728ed58750 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -68,6 +68,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::sqrt: // Begin floating-point.
case Intrinsic::sin:
case Intrinsic::cos:
+ case Intrinsic::tan:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::log:
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index d3ab306904da..7d7fe19568e8 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -604,6 +604,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(aarch64_vector_pcs);
KEYWORD(aarch64_sve_vector_pcs);
KEYWORD(aarch64_sme_preservemost_from_x0);
+ KEYWORD(aarch64_sme_preservemost_from_x1);
KEYWORD(aarch64_sme_preservemost_from_x2);
KEYWORD(msp430_intrcc);
KEYWORD(avr_intrcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 07c8aa23fc5e..f0fde9ae4df5 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2153,6 +2153,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'aarch64_vector_pcs'
/// ::= 'aarch64_sve_vector_pcs'
/// ::= 'aarch64_sme_preservemost_from_x0'
+/// ::= 'aarch64_sme_preservemost_from_x1'
/// ::= 'aarch64_sme_preservemost_from_x2'
/// ::= 'msp430_intrcc'
/// ::= 'avr_intrcc'
@@ -2212,6 +2213,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
case lltok::kw_aarch64_sme_preservemost_from_x0:
CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0;
break;
+ case lltok::kw_aarch64_sme_preservemost_from_x1:
+ CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1;
+ break;
case lltok::kw_aarch64_sme_preservemost_from_x2:
CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2;
break;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index b4765fb280f9..66b1c5f8ca82 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT.
+// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT,
+// G_INSERT_VECTOR_ELT, and G_VSCALE
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -400,3 +401,86 @@ bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
return false;
}
+
+bool CombinerHelper::matchAddOfVScale(const MachineOperand &MO,
+ BuildFnTy &MatchInfo) {
+ GAdd *Add = cast<GAdd>(MRI.getVRegDef(MO.getReg()));
+ GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getLHSReg()));
+ GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getRHSReg()));
+
+ Register Dst = Add->getReg(0);
+
+ if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) ||
+ !MRI.hasOneNonDBGUse(RHSVScale->getReg(0)))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildVScale(Dst, LHSVScale->getSrc() + RHSVScale->getSrc());
+ };
+
+ return true;
+}
+
+bool CombinerHelper::matchMulOfVScale(const MachineOperand &MO,
+ BuildFnTy &MatchInfo) {
+ GMul *Mul = cast<GMul>(MRI.getVRegDef(MO.getReg()));
+ GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Mul->getLHSReg()));
+
+ std::optional<APInt> MaybeRHS = getIConstantVRegVal(Mul->getRHSReg(), MRI);
+ if (!MaybeRHS)
+ return false;
+
+ Register Dst = MO.getReg();
+
+ if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildVScale(Dst, LHSVScale->getSrc() * *MaybeRHS);
+ };
+
+ return true;
+}
+
+bool CombinerHelper::matchSubOfVScale(const MachineOperand &MO,
+ BuildFnTy &MatchInfo) {
+ GSub *Sub = cast<GSub>(MRI.getVRegDef(MO.getReg()));
+ GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Sub->getRHSReg()));
+
+ Register Dst = MO.getReg();
+ LLT DstTy = MRI.getType(Dst);
+
+ if (!MRI.hasOneNonDBGUse(RHSVScale->getReg(0)) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, DstTy}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto VScale = B.buildVScale(DstTy, -RHSVScale->getSrc());
+ B.buildAdd(Dst, Sub->getLHSReg(), VScale, Sub->getFlags());
+ };
+
+ return true;
+}
+
+bool CombinerHelper::matchShlOfVScale(const MachineOperand &MO,
+ BuildFnTy &MatchInfo) {
+ GShl *Shl = cast<GShl>(MRI.getVRegDef(MO.getReg()));
+ GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Shl->getSrcReg()));
+
+ std::optional<APInt> MaybeRHS = getIConstantVRegVal(Shl->getShiftReg(), MRI);
+ if (!MaybeRHS)
+ return false;
+
+ Register Dst = MO.getReg();
+ LLT DstTy = MRI.getType(Dst);
+
+ if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_VSCALE, DstTy}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildVScale(Dst, LHSVScale->getSrc().shl(*MaybeRHS));
+ };
+
+ return true;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6f0cae2edab1..9830b521797c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -449,6 +449,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
RTLIBCASE(SIN_F);
case TargetOpcode::G_FCOS:
RTLIBCASE(COS_F);
+ case TargetOpcode::G_FTAN:
+ RTLIBCASE(TAN_F);
case TargetOpcode::G_FLOG10:
RTLIBCASE(LOG10_F);
case TargetOpcode::G_FLOG:
@@ -1037,6 +1039,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
case TargetOpcode::G_FREM:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FTAN:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG2:
@@ -2893,6 +2896,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_FFLOOR:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FTAN:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG2:
@@ -4659,6 +4663,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_INTRINSIC_TRUNC:
case G_FCOS:
case G_FSIN:
+ case G_FTAN:
case G_FSQRT:
case G_BSWAP:
case G_BITREVERSE:
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index e8438be94b3c..129e6963aef3 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -833,6 +833,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
case TargetOpcode::G_FREM:
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FTAN:
case TargetOpcode::G_FMA:
case TargetOpcode::G_FMAD:
if (SNaN)
@@ -1713,6 +1714,7 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) {
case TargetOpcode::G_FREM:
case TargetOpcode::G_FRINT:
case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FTAN:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_INTRINSIC_ROUND:
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 3397bd0a6060..a808a541103f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1339,14 +1339,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
if (SrcIdx && DstIdx)
return false;
- [[maybe_unused]] const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
+ const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
if (!DefMI->isImplicitDef()) {
if (DstReg.isPhysical()) {
Register NewDstReg = DstReg;
- unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
- DefMI->getOperand(0).getSubReg());
+ unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx);
if (NewDstIdx)
NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9a5359015439..02cd125eeff0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4041,17 +4041,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
}
- // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
- if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
- if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
- SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
- SDValue S0 = N1.getOperand(0);
- if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
- if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
- if (C->getAPIntValue() == (BitWidth - 1))
- return DAG.getNode(ISD::ABS, DL, VT, S0);
- }
- }
+ // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A)
+ if (hasOperation(ISD::ABS, VT) &&
+ sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) &&
+ sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1))))
+ return DAG.getNode(ISD::ABS, DL, VT, A);
// If the relocation model supports it, consider symbol offsets.
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8cd2bb60d81f..27c45cab2e0d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4514,6 +4514,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::COS_F80, RTLIB::COS_F128,
RTLIB::COS_PPCF128, Results);
break;
+ case ISD::FTAN:
+ case ISD::STRICT_FTAN:
+ ExpandFPLibCall(Node, RTLIB::TAN_F32, RTLIB::TAN_F64, RTLIB::TAN_F80,
+ RTLIB::TAN_F128, RTLIB::TAN_PPCF128, Results);
+ break;
case ISD::FSINCOS:
// Expand into sincos libcall.
ExpandSinCosLibCall(Node, Results);
@@ -5468,6 +5473,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::FSQRT:
case ISD::FSIN:
case ISD::FCOS:
+ case ISD::FTAN:
case ISD::FLOG:
case ISD::FLOG2:
case ISD::FLOG10:
@@ -5492,6 +5498,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::STRICT_FSQRT:
case ISD::STRICT_FSIN:
case ISD::STRICT_FCOS:
+ case ISD::STRICT_FTAN:
case ISD::STRICT_FLOG:
case ISD::STRICT_FLOG2:
case ISD::STRICT_FLOG10:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index fb1424f75e09..aa116c9de5d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -131,6 +131,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
case ISD::STRICT_FSUB:
case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
+ case ISD::STRICT_FTAN:
+ case ISD::FTAN: R = SoftenFloatRes_FTAN(N); break;
case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break;
@@ -774,6 +776,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
RTLIB::SUB_PPCF128));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTAN(SDNode *N) {
+ return SoftenFloatRes_Unary(
+ N, GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, RTLIB::TAN_F64,
+ RTLIB::TAN_F80, RTLIB::TAN_F128, RTLIB::TAN_PPCF128));
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
RTLIB::TRUNC_F32,
@@ -1330,7 +1338,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
#endif
report_fatal_error("Do not know how to expand the result of this "
"operator!");
-
+ // clang-format off
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
@@ -1399,6 +1407,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break;
case ISD::STRICT_FSUB:
case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break;
+ case ISD::STRICT_FTAN:
+ case ISD::FTAN: ExpandFloatRes_FTAN(N, Lo, Hi); break;
case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break;
@@ -1408,6 +1418,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
case ISD::STRICT_FREM:
case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break;
+ // clang-format on
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1768,6 +1779,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
RTLIB::SUB_PPCF128), Lo, Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_FTAN(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Unary(N,
+ GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32,
+ RTLIB::TAN_F64, RTLIB::TAN_F80,
+ RTLIB::TAN_F128, RTLIB::TAN_PPCF128),
+ Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
SDValue &Lo, SDValue &Hi) {
ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
@@ -2479,6 +2499,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
+ case ISD::FTAN:
case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break;
// Binary FP Operations
@@ -2914,6 +2935,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
+ case ISD::FTAN:
case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
// Binary FP Operations
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index bec9cb49b586..2350b562a034 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -586,6 +586,7 @@ private:
SDValue SoftenFloatRes_FSIN(SDNode *N);
SDValue SoftenFloatRes_FSQRT(SDNode *N);
SDValue SoftenFloatRes_FSUB(SDNode *N);
+ SDValue SoftenFloatRes_FTAN(SDNode *N);
SDValue SoftenFloatRes_FTRUNC(SDNode *N);
SDValue SoftenFloatRes_LOAD(SDNode *N);
SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N);
@@ -635,6 +636,7 @@ private:
SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC,
SDValue &Lo, SDValue &Hi);
+ // clang-format off
void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -667,9 +669,11 @@ private:
void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FTAN (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+ // clang-format on
// Float Operand Expansion.
bool ExpandFloatOperand(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6acbc044d673..8cdb4ba0ade6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -397,6 +397,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FSQRT:
case ISD::FSIN:
case ISD::FCOS:
+ case ISD::FTAN:
case ISD::FLDEXP:
case ISD::FPOWI:
case ISD::FPOW:
@@ -506,7 +507,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
break; \
} \
/* Defer non-vector results to LegalizeDAG. */ \
- if (!Node->getValueType(0).isVector()) { \
+ if (!Node->getValueType(0).isVector() && \
+ Node->getValueType(0) != MVT::Other) { \
Action = TargetLowering::Legal; \
break; \
} \
@@ -990,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
- if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
- Results.push_back(Expanded);
- return;
- }
- break;
+ Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
+ return;
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 361416edb554..92ce3b17ed6c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -108,6 +108,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
+ case ISD::FTAN:
case ISD::FTRUNC:
case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
@@ -1140,6 +1141,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT: case ISD::VP_SQRT:
+ case ISD::FTAN:
case ISD::FTRUNC:
case ISD::VP_FROUNDTOZERO:
case ISD::SINT_TO_FP:
@@ -4400,6 +4402,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
+ case ISD::FTAN:
case ISD::FTRUNC:
if (unrollExpandedOp())
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4a6a431696b5..e176cf2cc2a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5375,6 +5375,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::FREM:
case ISD::FSIN:
case ISD::FCOS:
+ case ISD::FTAN:
case ISD::FMA:
case ISD::FMAD: {
if (SNaN)
@@ -6332,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
}
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
- EVT VT, ArrayRef<SDValue> Ops) {
+ EVT VT, ArrayRef<SDValue> Ops,
+ SDNodeFlags Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
@@ -6689,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
// Constant fold the scalar operands.
- SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
+ SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
// Legalize the (integer) scalar constant if necessary.
if (LegalSVT != SVT)
@@ -7260,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
// Perform trivial constant folding.
- if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
return SV;
// Canonicalize an UNDEF to the RHS, even over a constant.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba76456b5836..2f3626f1c820 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1684,7 +1684,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
if (!FragmentExpr)
continue;
SDDbgValue *SDV = DAG.getVRegDbgValue(
- Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, SDNodeOrder);
+ Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, Order);
DAG.AddDbgValue(SDV, false);
Offset += RegisterSize;
}
@@ -1699,11 +1699,10 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
}
// We have created a SDDbgOperand for each Value in Values.
- // Should use Order instead of SDNodeOrder?
assert(!LocationOps.empty());
- SDDbgValue *SDV = DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies,
- /*IsIndirect=*/false, DbgLoc,
- SDNodeOrder, IsVariadic);
+ SDDbgValue *SDV =
+ DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies,
+ /*IsIndirect=*/false, DbgLoc, Order, IsVariadic);
DAG.AddDbgValue(SDV, /*isParameter=*/false);
return true;
}
@@ -6742,6 +6741,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::fabs:
case Intrinsic::sin:
case Intrinsic::cos:
+ case Intrinsic::tan:
case Intrinsic::exp10:
case Intrinsic::floor:
case Intrinsic::ceil:
@@ -6759,6 +6759,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::fabs: Opcode = ISD::FABS; break;
case Intrinsic::sin: Opcode = ISD::FSIN; break;
case Intrinsic::cos: Opcode = ISD::FCOS; break;
+ case Intrinsic::tan: Opcode = ISD::FTAN; break;
case Intrinsic::exp10: Opcode = ISD::FEXP10; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
@@ -9160,6 +9161,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
if (visitUnaryFloatCall(I, ISD::FCOS))
return;
break;
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ if (visitUnaryFloatCall(I, ISD::FTAN))
+ return;
+ break;
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 2198c2354483..52da24b59451 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -210,6 +210,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FCOS: return "fcos";
case ISD::STRICT_FCOS: return "strict_fcos";
case ISD::FSINCOS: return "fsincos";
+ case ISD::FTAN: return "ftan";
+ case ISD::STRICT_FTAN: return "strict_ftan";
case ISD::FTRUNC: return "ftrunc";
case ISD::STRICT_FTRUNC: return "strict_ftrunc";
case ISD::FFLOOR: return "ffloor";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f856c8a51984..e1c1a6b09b11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
bool IsMax = Opc == ISD::FMAXIMUM;
SDNodeFlags Flags = N->getFlags();
- if (VT.isVector() &&
- isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
- return SDValue();
-
// First, implement comparison not propagating NaN. If no native fmin or fmax
// available, use plain select with setcc instead.
SDValue MinMax;
@@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
} else if (isOperationLegalOrCustom(CompOpc, VT)) {
MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
} else {
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return DAG.UnrollVectorOp(N);
+
// NaN (if exists) will be propagated later, so orderness doesn't matter.
SDValue Compare =
DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
@@ -9159,6 +9158,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
isOperationLegal(ISD::SMAX, VT)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
+ Op = DAG.getFreeze(Op);
return DAG.getNode(ISD::SMAX, dl, VT, Op,
DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
}
@@ -9175,8 +9175,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
// 0 - abs(x) -> smin(x, sub(0,x))
if (IsNegative && isOperationLegal(ISD::SUB, VT) &&
isOperationLegal(ISD::SMIN, VT)) {
- Op = DAG.getFreeze(Op);
SDValue Zero = DAG.getConstant(0, dl, VT);
+ Op = DAG.getFreeze(Op);
return DAG.getNode(ISD::SMIN, dl, VT, Op,
DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3aec7049e0cc..8240a1fd7e2f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -141,6 +141,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
setLibcallName(RTLIB::EXP10_F128, "exp10f128");
setLibcallName(RTLIB::SIN_F128, "sinf128");
setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::TAN_F128, "tanf128");
setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
@@ -1015,7 +1016,8 @@ void TargetLoweringBase::initActions() {
setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP,
ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT,
ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::LROUND,
- ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN},
+ ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN,
+ ISD::FTAN},
{MVT::f32, MVT::f64, MVT::f128}, Expand);
// Default ISD::TRAP to expand (which turns it into abort).
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 004622061120..f44a6a472cb6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -1183,8 +1183,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end()) {
resolveRelocation(Section, Offset,
- (uint64_t)Section.getAddressWithOffset(i->second),
- RelType, 0);
+ Section.getLoadAddressWithOffset(i->second), RelType, 0);
LLVM_DEBUG(dbgs() << " Stub function found\n");
} else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
// Create a new stub function.
@@ -1217,8 +1216,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
addRelocationForSection(REmovk_g0, Value.SectionID);
}
resolveRelocation(Section, Offset,
- reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
- Section.getStubOffset())),
+ Section.getLoadAddressWithOffset(Section.getStubOffset()),
RelType, 0);
Section.advanceStubOffset(getMaxStubSize());
}
@@ -1349,10 +1347,9 @@ RuntimeDyldELF::processRelocationRef(
// Look for an existing stub.
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end()) {
- resolveRelocation(
- Section, Offset,
- reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)),
- RelType, 0);
+ resolveRelocation(Section, Offset,
+ Section.getLoadAddressWithOffset(i->second), RelType,
+ 0);
LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
@@ -1367,10 +1364,10 @@ RuntimeDyldELF::processRelocationRef(
else
addRelocationForSection(RE, Value.SectionID);
- resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>(
- Section.getAddressWithOffset(
- Section.getStubOffset())),
- RelType, 0);
+ resolveRelocation(
+ Section, Offset,
+ Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType,
+ 0);
Section.advanceStubOffset(getMaxStubSize());
}
} else {
@@ -1609,8 +1606,7 @@ RuntimeDyldELF::processRelocationRef(
if (i != Stubs.end()) {
// Symbol function stub already created, just relocate to it
resolveRelocation(Section, Offset,
- reinterpret_cast<uint64_t>(
- Section.getAddressWithOffset(i->second)),
+ Section.getLoadAddressWithOffset(i->second),
RelType, 0);
LLVM_DEBUG(dbgs() << " Stub function found\n");
} else {
@@ -1652,10 +1648,10 @@ RuntimeDyldELF::processRelocationRef(
addRelocationForSection(REl, Value.SectionID);
}
- resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>(
- Section.getAddressWithOffset(
- Section.getStubOffset())),
- RelType, 0);
+ resolveRelocation(
+ Section, Offset,
+ Section.getLoadAddressWithOffset(Section.getStubOffset()),
+ RelType, 0);
Section.advanceStubOffset(getMaxStubSize());
}
if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID)) {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2c4b45255d05..92213e19c9d9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
// Loop is not unrollable if the loop contains certain instructions.
- if (!UCE.canUnroll() || UCE.Convergent) {
+ if (!UCE.canUnroll()) {
LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
return 1;
}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 7a5f18fe2cbd..0bf8be9ac55f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -326,6 +326,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
Out << "aarch64_sme_preservemost_from_x0";
break;
+ case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
+ Out << "aarch64_sme_preservemost_from_x1";
+ break;
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
Out << "aarch64_sme_preservemost_from_x2";
break;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a7ed2de6e8a5..2f4b8351e747 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5368,8 +5368,8 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
return DL.empty() ? std::string("G1") : (DL + "-G1").str();
}
- if (T.isRISCV64()) {
- // Make i32 a native type for 64-bit RISC-V.
+ if (T.isLoongArch64() || T.isRISCV64()) {
+ // Make i32 a native type for 64-bit LoongArch and RISC-V.
auto I = DL.find("-n64-");
if (I != StringRef::npos)
return (DL.take_front(I) + "-n32:64-" + DL.drop_front(I + 5)).str();
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 985f9351f4a3..788e92f94b26 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -877,7 +877,7 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
break;
case wasm::WASM_EXTERNAL_TABLE:
W->OS << char(Import.Table.ElemType);
- encodeULEB128(0, W->OS); // flags
+ encodeULEB128(Import.Table.Limits.Flags, W->OS);
encodeULEB128(NumElements, W->OS); // initial
break;
case wasm::WASM_EXTERNAL_TAG:
@@ -1022,7 +1022,8 @@ void WasmObjectWriter::writeElemSection(
encodeULEB128(TableNumber, W->OS); // the table number
// init expr for starting offset
- W->OS << char(wasm::WASM_OPCODE_I32_CONST);
+ W->OS << char(is64Bit() ? wasm::WASM_OPCODE_I64_CONST
+ : wasm::WASM_OPCODE_I32_CONST);
encodeSLEB128(InitialTableOffset, W->OS);
W->OS << char(wasm::WASM_OPCODE_END);
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 2b6bdbf24afa..cbc55a145e0e 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -586,6 +586,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
return "gfx1150";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151:
return "gfx1151";
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152:
+ return "gfx1152";
// AMDGCN GFX12.
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200:
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 8e2a9481c922..0fee299994bc 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -611,6 +611,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1103, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1150, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1151, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1152, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 316d05bf1dc3..8dd060d0151a 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -139,6 +139,7 @@
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
#include "llvm/Transforms/IPO/ElimAvailExtern.h"
#include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 50682ca4970f..dad97146a9f6 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
MODULE_PASS("dxil-upgrade", DXILUpgradePass())
MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable))
MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
MODULE_PASS("function-import", FunctionImportPass())
MODULE_PASS("globalopt", GlobalOptPass())
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index fc3be716087e..693897f874a2 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -690,7 +690,7 @@ Error RawMemProfReader::readNextRecord(
return F;
auto Iter = this->GuidToSymbolName.find(F.Function);
assert(Iter != this->GuidToSymbolName.end());
- F.SymbolName = Iter->getSecond();
+ F.SymbolName = std::make_unique<std::string>(Iter->getSecond());
return F;
};
return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback);
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index fcefdef992be..7360901f2962 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -867,21 +867,16 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
// Any intermediate directories we create should be accessible by
// the owner, even if Perms says otherwise for the final path.
const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all;
+
+ StringRef Name = *I;
while (true) {
- StringRef Name = *I;
- detail::InMemoryNode *Node = Dir->getChild(Name);
+ Name = *I;
++I;
+ if (I == E)
+ break;
+ detail::InMemoryNode *Node = Dir->getChild(Name);
if (!Node) {
- if (I == E) {
- // End of the path.
- Dir->addChild(
- Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime,
- std::move(Buffer), ResolvedUser, ResolvedGroup,
- ResolvedType, ResolvedPerms}));
- return true;
- }
-
- // Create a new directory. Use the path up to here.
+ // This isn't the last element, so we create a new directory.
Status Stat(
StringRef(Path.str().begin(), Name.end() - Path.str().begin()),
getDirectoryID(Dir->getUniqueID(), Name),
@@ -891,27 +886,33 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
Name, std::make_unique<detail::InMemoryDirectory>(std::move(Stat))));
continue;
}
+ // Creating file under another file.
+ if (!isa<detail::InMemoryDirectory>(Node))
+ return false;
+ Dir = cast<detail::InMemoryDirectory>(Node);
+ }
+ detail::InMemoryNode *Node = Dir->getChild(Name);
+ if (!Node) {
+ Dir->addChild(Name,
+ MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime,
+ std::move(Buffer), ResolvedUser, ResolvedGroup,
+ ResolvedType, ResolvedPerms}));
+ return true;
+ }
+ if (isa<detail::InMemoryDirectory>(Node))
+ return ResolvedType == sys::fs::file_type::directory_file;
- if (auto *NewDir = dyn_cast<detail::InMemoryDirectory>(Node)) {
- Dir = NewDir;
- } else {
- assert((isa<detail::InMemoryFile>(Node) ||
- isa<detail::InMemoryHardLink>(Node)) &&
- "Must be either file, hardlink or directory!");
-
- // Trying to insert a directory in place of a file.
- if (I != E)
- return false;
+ assert((isa<detail::InMemoryFile>(Node) ||
+ isa<detail::InMemoryHardLink>(Node)) &&
+ "Must be either file, hardlink or directory!");
- // Return false only if the new file is different from the existing one.
- if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node)) {
- return Link->getResolvedFile().getBuffer()->getBuffer() ==
- Buffer->getBuffer();
- }
- return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() ==
- Buffer->getBuffer();
- }
+ // Return false only if the new file is different from the existing one.
+ if (auto *Link = dyn_cast<detail::InMemoryHardLink>(Node)) {
+ return Link->getResolvedFile().getBuffer()->getBuffer() ==
+ Buffer->getBuffer();
}
+ return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() ==
+ Buffer->getBuffer();
}
bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 4b2ce0d73949..5708b6173750 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -85,6 +85,10 @@ def SMEUnsupported : AArch64Unsupported {
SME2Unsupported.F);
}
+def MTEUnsupported : AArch64Unsupported {
+ let F = [HasMTE];
+}
+
let F = [HasPAuth, HasPAuthLR] in
def PAUnsupported : AArch64Unsupported;
@@ -109,6 +113,7 @@ include "AArch64SchedNeoverseN1.td"
include "AArch64SchedNeoverseN2.td"
include "AArch64SchedNeoverseV1.td"
include "AArch64SchedNeoverseV2.td"
+include "AArch64SchedOryon.td"
include "AArch64Processors.td"
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 32646c6ee689..941990c53c4a 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -589,6 +589,14 @@ def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
(sequence "X%u",19, 28),
LR, FP)>;
+// SME ABI support routines such as __arm_get_current_vg preserve most registers.
+def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
+ : CalleeSavedRegs<(add (sequence "Z%u", 0, 31),
+ (sequence "P%u", 0, 15),
+ (sequence "X%u", 1, 15),
+ (sequence "X%u",19, 28),
+ LR, FP)>;
+
// SME ABI support routines __arm_sme_state preserves most registers.
def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
: CalleeSavedRegs<(add (sequence "Z%u", 0, 31),
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 8d16709114df..a759efcd9441 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -617,6 +617,27 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeatureLdpAlignedOnly,
FeatureStpAlignedOnly]>;
+def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
+ "Oryon",
+ "Nuvia Inc Oryon processors", [
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureFuseAES,
+ FeatureFuseAdrpAdd,
+ FeatureEnableSelectOptimize,
+ FeatureFuseCryptoEOR,
+ FeatureFuseAddress,
+ FeatureSM4,
+ FeatureSHA2,
+ FeatureSHA3,
+ FeatureAES,
+ FeatureFullFP16,
+ FeatureFP16FML,
+ FeaturePerfMon,
+ FeatureSPE,
+ FeaturePostRAScheduler,
+ HasV8_6aOps]>;
def ProcessorFeatures {
list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -806,6 +827,11 @@ def ProcessorFeatures {
FeatureSHA3, FeatureAES, FeatureCSSC,
FeatureWFxT, FeatureFullFP16];
+ list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
+ FeatureCrypto, FeatureRandGen,
+ FeaturePAuth, FeatureSM4, FeatureSHA2,
+ FeatureSHA3, FeatureAES];
+
// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64. The extensions do not
// affect code generated by the compiler and can be used only by explicitly
@@ -988,3 +1014,7 @@ def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A,
def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B,
[TuneAmpere1B]>;
+
+// Qualcomm Oryon
+def : ProcessorModel<"oryon-1", OryonModel, ProcessorFeatures.Oryon,
+ [TuneOryon]>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index e97d7e3b6ed8..cc50b59dd8d7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -107,13 +107,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (MF->getFunction().getCallingConv() ==
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
report_fatal_error(
- "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
- "only supported to improve calls to SME ACLE save/restore/disable-za "
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is only "
+ "supported to improve calls to SME ACLE save/restore/disable-za "
"functions, and is not intended to be used beyond that scope.");
if (MF->getFunction().getCallingConv() ==
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+ report_fatal_error(
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is "
+ "only supported to improve calls to SME ACLE __arm_get_current_vg "
+ "function, and is not intended to be used beyond that scope.");
+ if (MF->getFunction().getCallingConv() ==
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
report_fatal_error(
- "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
"only supported to improve calls to SME ACLE __arm_sme_state "
"and is not intended to be used beyond that scope.");
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
@@ -153,13 +162,22 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
if (MF->getFunction().getCallingConv() ==
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
report_fatal_error(
- "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
"only supported to improve calls to SME ACLE save/restore/disable-za "
"functions, and is not intended to be used beyond that scope.");
if (MF->getFunction().getCallingConv() ==
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+ report_fatal_error(
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is "
+ "only supported to improve calls to SME ACLE __arm_get_current_vg "
+ "function, and is not intended to be used beyond that scope.");
+ if (MF->getFunction().getCallingConv() ==
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
report_fatal_error(
- "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
+ "Calling convention "
+ "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
"only supported to improve calls to SME ACLE __arm_sme_state "
"and is not intended to be used beyond that scope.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
@@ -236,6 +254,8 @@ AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
"Calling convention SVE_VectorCall is unsupported on Darwin.");
if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask;
+ if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+ return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask;
if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask;
if (CC == CallingConv::CFGuard_Check)
@@ -282,6 +302,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: CSR_AArch64_SVE_AAPCS_RegMask;
if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask;
+ if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+ return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask;
if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask;
if (CC == CallingConv::CFGuard_Check)
@@ -643,6 +665,7 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
case CallingConv::AArch64_VectorCall:
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
+ case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
if (STI.isTargetWindows())
return HasReg(CC_AArch64_Win64PCS_ArgRegs, Reg);
diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
new file mode 100644
index 000000000000..09d1af248f0e
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
@@ -0,0 +1,1659 @@
+//=- AArch64SchedOryon.td - Qualcomm Oryon CPU 001 ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Qualcomm Oryon
+// family of processors.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pipeline Description.
+
+def OryonModel : SchedMachineModel {
+ let IssueWidth = 14;
+ let MicroOpBufferSize = 376;
+ let LoadLatency = 4;
+ let MispredictPenalty = 13; // 13 cycles for mispredicted branch.
+ let LoopMicroOpBufferSize = 0; // Do not have a LoopMicroOpBuffer
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ SMEUnsupported.F,
+ MTEUnsupported.F,
+ PAUnsupported.F,
+ [HasPAuth, HasCSSC]);
+}
+
+let SchedModel = OryonModel in {
+
+// Issue ports.
+// IXU has 6 ports p0 ~ p5
+// LSU has 4 ports p6 ~ p9 (ls0 ~ ls3); p10/p11 (std0, std1) have to work with ls0 ~ ls3
+// VXU has 4 ports p12 ~ p15
+
+// cross IXU/LSU/VXU resource group for FMOV P41 of VXU
+// I2V
+def ORYONI4FP0 : ProcResource<1>;
+def ORYONI5FP1 : ProcResource<1>;
+// V2I
+def ORYONFP0I4 : ProcResource<1>;
+def ORYONFP1I5 : ProcResource<1>;
+
+// store 1 for normal store instructions
+def ORYONST0 : ProcResource<1>;
+// store 2 for normal store instructions
+def ORYONST1 : ProcResource<1>;
+
+// Port 0: ALU/Indirect/Direct Branch.
+def ORYONP0 : ProcResource<1>;
+
+// Port 1: ALU/Direct Branch.
+def ORYONP1 : ProcResource<1>;
+
+// Port 2: ALU.
+def ORYONP2 : ProcResource<1>;
+
+// Port 3: ALU.
+def ORYONP3 : ProcResource<1>;
+
+// Port 4: ALU. NOTE(review): Super is assigned twice below; presumably only the last `let` (ORYONFP0I4) takes effect - confirm.
+def ORYONP4 : ProcResource<1> {
+ let Super = ORYONI4FP0;
+ let Super = ORYONFP0I4; }
+
+// Port 5: ALU. NOTE(review): Super is assigned twice below; presumably only the last `let` (ORYONFP1I5) takes effect - confirm.
+def ORYONP5 : ProcResource<1> {
+ let Super = ORYONI5FP1;
+ let Super = ORYONFP1I5; }
+
+// Port 6: Load/Store. LS0
+def ORYONP6 : ProcResource<1> {
+ let Super = ORYONST0; }
+
+// Port 7: Load/store. LS1
+def ORYONP7 : ProcResource<1> {
+ let Super = ORYONST0; }
+
+// Port 8: Load/Store. LS2
+def ORYONP8 : ProcResource<1> {
+ let Super = ORYONST1; }
+
+// Port 9: Load/store. LS3
+def ORYONP9 : ProcResource<1> {
+ let Super = ORYONST1; }
+
+// Port 10: Load/Store. STD0
+def ORYONP10SD0 : ProcResource<1> {
+ let Super = ORYONST0; }
+
+// Port 11: Load/store. STD1
+def ORYONP11SD1 : ProcResource<1> {
+ let Super = ORYONST1; }
+
+// Port 12: FP/Neon/SIMD/Crypto. NOTE(review): Super is assigned twice below; presumably only the last `let` (ORYONFP0I4) takes effect - confirm.
+def ORYONP12FP0 : ProcResource<1> {
+ let Super = ORYONI4FP0;
+ let Super = ORYONFP0I4; }
+
+// Port 13: FP/Neon/SIMD/Crypto. NOTE(review): Super is assigned twice below; presumably only the last `let` (ORYONFP1I5) takes effect - confirm.
+def ORYONP13FP1 : ProcResource<1> {
+ let Super = ORYONI5FP1;
+ let Super = ORYONFP1I5; }
+
+// Port 14: FP/Neon/SIMD/Crypto.
+def ORYONP14FP2 : ProcResource<1>;
+
+// Port 15: FP/Neon/SIMD/Crypto.
+def ORYONP15FP3 : ProcResource<1>;
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes.
+
+// Integer add/shift/logical/misc. instructions on port I0/I1/I2/I3/I4/I5.
+def ORYONI012345 : ProcResGroup<[ORYONP0, ORYONP1, ORYONP2,
+ ORYONP3, ORYONP4, ORYONP5]> {
+ let BufferSize = 120;
+}
+
+// Direct Conditional Branch instructions on ports I0/I1.
+def ORYONI01 : ProcResGroup<[ORYONP0, ORYONP1]> {
+ let BufferSize = 40;
+}
+
+// Indirect/crypto Conditional Branch instructions on ports I0.
+def ORYONI0 : ProcResGroup<[ORYONP0]> {
+ let BufferSize = 20;
+}
+
+// Crypto/CRC/PAU instructions on ports I2.
+def ORYONI2 : ProcResGroup<[ORYONP2]> {
+ let BufferSize = 20;
+}
+
+// Multiply/Multiply-ADD instructions on ports I4/I5.
+def ORYONI45 : ProcResGroup<[ORYONP4, ORYONP5]> {
+ let BufferSize = 40;
+}
+
+// Divide instructions on ports I5.
+def ORYONI5 : ProcResGroup<[ORYONP5]> {
+ let BufferSize = 20;
+}
+
+// Comparison instructions on ports I0/I1/I2/I3.
+def ORYONI0123 : ProcResGroup<[ORYONP0, ORYONP1,
+ ORYONP2, ORYONP3]> {
+ let BufferSize = 80;
+}
+
+// Load instructions on ports P6/P7/P8/P9.
+def ORYONLD : ProcResGroup<[ORYONP6, ORYONP7, ORYONP8, ORYONP9]> {
+ let BufferSize = 64;
+}
+
+// Store instructions on combo of STA/STD pipes
+def ORYONST : ProcResGroup<[ORYONST0, ORYONST1]> {
+ let BufferSize = 64;
+}
+
+// Arithmetic and CRYP-AED ASIMD/FP instructions on ports FP0/FP1/FP2/FP3.
+def ORYONFP0123 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1,
+ ORYONP14FP2, ORYONP15FP3]> {
+ let BufferSize = 192;
+}
+
+// FP Comparison and F/I move instructions on ports FP0/FP1.
+def ORYONFP01 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1]> {
+ let BufferSize = 96;
+}
+
+// FDIV instructions on ports FP3.
+def ORYONFP3 : ProcResGroup<[ORYONP15FP3]> {
+ let BufferSize = 48;
+}
+
+// CRYP-SHA instructions on ports FP1. NOTE(review): this group lists ORYONP14FP2, the same unit as ORYONFP2 below, not ORYONP13FP1 - confirm.
+def ORYONFP1 : ProcResGroup<[ORYONP14FP2]> {
+ let BufferSize = 48;
+}
+
+def ORYONFP2 : ProcResGroup<[ORYONP14FP2]> {
+ let BufferSize = 48;
+}
+
+// Reciprocal, Square root on FP0.
+def ORYONFP0 : ProcResGroup<[ORYONP12FP0]> {
+ let BufferSize = 48;
+}
+
+// cross IXU/LSU/VXU resource group for FMOV P41 of VXU
+// I2V
+def ORYONI2V : ProcResGroup<[ORYONI4FP0, ORYONI5FP1]> {
+ let BufferSize = 40;
+}
+
+// V2I
+def ORYONV2I : ProcResGroup<[ORYONFP0I4, ORYONFP1I5]> {
+ let BufferSize = 96;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: ORYONWrite_<NumCycles>Cyc_<Resources>.
+
+// Because of the complexity of Oryon CPU, we skip the following
+// generic definitions and define each instruction specifically
+
+// These WriteRes entries are not used in the Oryon sched model.
+def : WriteRes<WriteImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteI, []> { let Unsupported = 1; }
+def : WriteRes<WriteISReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteIEReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteExtr, []> { let Unsupported = 1; }
+def : WriteRes<WriteIS, []> { let Unsupported = 1; }
+def : WriteRes<WriteID32, []> { let Unsupported = 1; }
+def : WriteRes<WriteID64, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM32, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM64, []> { let Unsupported = 1; }
+def : WriteRes<WriteBr, []> { let Unsupported = 1; }
+def : WriteRes<WriteBrReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTP, []> { let Unsupported = 1; }
+def : WriteRes<WriteAdr, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteF, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCmp, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCvt, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCopy, []> { let Unsupported = 1; }
+def : WriteRes<WriteFImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteFMul, []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv, []> { let Unsupported = 1; }
+def : WriteRes<WriteVd, []> { let Unsupported = 1; }
+def : WriteRes<WriteVq, []> { let Unsupported = 1; }
+def : WriteRes<WriteVLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteVST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSys, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDHi, []> { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// These ReadAdvance entries will be defined in later implementation
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
+
+
+//IXU resource definition
+// 1 cycle, no pipe
+def ORYONWrite_1Cyc_NONE : SchedWriteRes<[]>;
+
+// 1 cycle on I01.
+def ORYONWrite_1Cyc_I01 : SchedWriteRes<[ORYONI01]>;
+
+def ORYONWrite_1Cyc_2Uops_I01 : SchedWriteRes<[ORYONI01]> {
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_I0 : SchedWriteRes<[ORYONI0]>;
+
+// 7 cycles on I2. PAC*/AUT* instructions
+def ORYONWrite_7Cyc_I2 : SchedWriteRes<[ORYONI2]> {
+ let Latency = 7;
+}
+
+// 7 cycles on I2, 3 micro-ops. PAC*/AUT* instructions
+def ORYONWrite_7Cyc_3Uops_I2 : SchedWriteRes<[ORYONI2]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 9 (7+1+1) cycles on I2 and I0/I1, I0. Authentication branch instructions
+// these instructions are broken down to three uops
+// a. PtrAuth on pipe 2 taking 7 cycles
+// b. Link Register Update on pipes 0 and 1 taking 1 cycle
+// c. Indirect branch on pipe 0 taking 1 cycle
+
+def ORYONWrite_9Cyc_I012 : SchedWriteRes<[ORYONI2, ORYONI01]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+// 3 cycles on I2. CRC32 and CRC32C instructions
+def ORYONWrite_3Cyc_I2 : SchedWriteRes<[ORYONI2]> {
+ let Latency = 3;
+}
+
+// 1 cycle on I012345
+def ORYONWrite_1Cyc_I012345 : SchedWriteRes<[ORYONI012345]>;
+
+// 1 cycle on I0123
+def ORYONWrite_1Cyc_I0123 : SchedWriteRes<[ORYONI0123]>;
+
+// 1 cycle on 2 of I012345
+def ORYONWrite_1Cyc_I012345_I012345 :
+SchedWriteRes<[ORYONI012345, ORYONI012345]> ;
+
+// 2 cycle on 2 of I0123 with ReleaseAtCycles
+def ORYONWrite_2Cyc_I0123_I0123_RC :
+SchedWriteRes<[ORYONI0123, ORYONI0123]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [2,2];
+}
+
+// 2 cycle on 2 of I012345
+def ORYONWrite_2Cyc_I012345_I012345_RC :
+SchedWriteRes<[ORYONI012345, ORYONI012345]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [2,2];
+}
+
+// 3 cycle on 2 of I45
+def ORYONWrite_3Cyc_I45_I45_RC :
+SchedWriteRes<[ORYONI45, ORYONI45]> {
+ let Latency = 3;
+ let ReleaseAtCycles = [2,2];
+}
+
+// 3 cycle on I45
+def ORYONWrite_3Cyc_I45 : SchedWriteRes<[ORYONI45]> {
+ let Latency = 3;
+}
+
+// 7 cycle on I2 32-bit integer division
+def ORYONWrite_7Cyc_I2_RC : SchedWriteRes<[ORYONI2]> {
+ let Latency = 7;
+ let ReleaseAtCycles = [2];
+}
+
+// 9 cycle on I2 64-bit integer division
+def ORYONWrite_9Cyc_I2_RC : SchedWriteRes<[ORYONI2]> {
+ let Latency = 9;
+ let ReleaseAtCycles = [2];
+}
+
+// LSU resource definition
+// Still need to define WriteLDAdr, WriteAdrAdr, WriteLDHi, WriteSTX
+// 4 cycles on LS(P6789), modeled with the ORYONLD resource
+def ORYONWrite_4Cyc_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 4;
+}
+
+// 4 cycles for Post/Pre inc/dec access; also covers all Post/Pre pair loads
+def ORYONWrite_4Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 4;
+}
+
+// 5 (4+1) cycles for VXU SIMD access; could also include FP.
+// The resource list might not be correct, as no VXU resource is included.
+def ORYONWrite_5Cyc_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+}
+
+def ORYONWrite_5Cyc_2Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_5Cyc_3Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def ORYONWrite_5Cyc_4Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def ORYONWrite_5Cyc_5Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+}
+
+def ORYONWrite_5Cyc_6Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def ORYONWrite_5Cyc_8Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+}
+
+def ORYONWrite_5Cyc_10Uops_LD : SchedWriteRes<[ORYONLD]> {
+ let Latency = 5;
+ let NumMicroOps = 10;
+}
+
+// 5 cycles (as modeled) for Post/Pre inc/dec access. NOTE(review): this comment used to say 6 cycles - confirm
+def ORYONWrite_5Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+}
+
+def ORYONWrite_5Cyc_2Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_5Cyc_3Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def ORYONWrite_5Cyc_4Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def ORYONWrite_5Cyc_5Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+}
+
+def ORYONWrite_5Cyc_6Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def ORYONWrite_5Cyc_8Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+}
+
+def ORYONWrite_5Cyc_10Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+ let Latency = 5;
+ let NumMicroOps = 10;
+}
+
+// 1 cycle for all generic stores, on the ORYONST resource; the variants differ only in NumMicroOps
+def ORYONWrite_1Cyc_ST : SchedWriteRes<[ORYONST]>;
+
+def ORYONWrite_1Cyc_2Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_3Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 3;
+}
+
+def ORYONWrite_1Cyc_4Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 4;
+}
+
+def ORYONWrite_1Cyc_5Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 5;
+}
+
+def ORYONWrite_1Cyc_6Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 6;
+}
+
+def ORYONWrite_1Cyc_8Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 8;
+}
+
+def ORYONWrite_1Cyc_10Uops_ST : SchedWriteRes<[ORYONST]> {
+ let NumMicroOps = 10;
+}
+
+// 1 cycle for neon write: float + ASIMD with Post/Pre Inc/Dec access.
+// Also covers pair stores until further information is available.
+def ORYONWrite_1Cyc_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 3; // NOTE(review): same value as the _3Uops_ variant below - confirm
+}
+
+def ORYONWrite_1Cyc_2Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_3Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 3;
+}
+
+def ORYONWrite_1Cyc_4Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 4;
+}
+
+def ORYONWrite_1Cyc_5Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 5;
+}
+
+def ORYONWrite_1Cyc_6Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 6;
+}
+
+def ORYONWrite_1Cyc_8Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 8;
+}
+
+def ORYONWrite_1Cyc_10Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+ let NumMicroOps = 10;
+}
+
+// VXU resource definition
+
+// An I2V instruction has 1 uOp.
+// I2V with convert has 2 uOps.
+// All I2V and V2I throughputs are 2.
+// See the VXU doc, p37 -- latencies and throughput;
+// p41, resources taken; p42, uOps.
+def ORYONWrite_I2V_4Cyc_I45 : SchedWriteRes<[ORYONI2V]> {
+ let Latency = 4;
+}
+
+// Inlines an FCVT, so add one more uOp
+def ORYONWrite_I2V_7Cyc_I45 : SchedWriteRes<[ORYONI2V]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// A V2I move instruction has 1/2 uOps (p42 in the VXU doc).
+// Latency is 3; FCVT is also 3 cycles,
+// so move + convert is 6 (3+3) cycles.
+// Throughput is 2.
+def ORYONWrite_V2I_3Cyc_FP01 : SchedWriteRes<[ORYONV2I]> {
+ let Latency = 3;
+}
+
+// Inlines an FCVT, so add one more uOp
+def ORYONWrite_V2I_6Cyc_FP01 : SchedWriteRes<[ORYONV2I]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_V2V_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 2 cycles on FP0123
+ let Latency = 2;
+}
+
+def ORYONWrite_V2V_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 3 cycles on FP0123
+ let Latency = 3;
+}
+
+def ORYONWrite_V2V_6Cyc_FP01 : SchedWriteRes<[ORYONFP0123]> { // NOTE(review): name says FP01, resource is ORYONFP0123 - confirm
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def ORYONWrite_4Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 4 cycles on FP0123
+ let Latency = 4;
+}
+
+def ORYONWrite_3Cyc_FP0 : SchedWriteRes<[ORYONFP0]> { // 3 cycles on FP0
+ let Latency = 3;
+}
+
+def ORYONWrite_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 3 cycles on FP0123
+ let Latency = 3;
+}
+
+def ORYONWrite_3Cyc_2Uops_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 3 cycles, 2 uOps on FP0123
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def ORYONWrite_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { // 2 cycles on FP0123
+ let Latency = 2;
+}
+
+def ORYONWrite_2Cyc_FP01 : SchedWriteRes<[ORYONFP01]> { // 2 cycles on FP01
+ let Latency = 2;
+}
+
+// 2 cycles on FP1
+def ORYONWrite_2Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+ let Latency = 2;
+}
+
+// 3 cycles on FP1
+def ORYONWrite_3Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+ let Latency = 3;
+}
+
+// 4 cycles, 0.5 throughput on FP1 (ReleaseAtCycles = 4)
+def ORYONWrite_4Cyc_FP1_RC4 : SchedWriteRes<[ORYONFP1]> {
+ let Latency = 4;
+ let ReleaseAtCycles = [4];
+}
+
+// 5 cycles, 1 throughput on FP1
+def ORYONWrite_5Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+ let Latency = 5;
+}
+
+// 8 cycles, 2 throughput on FP0123 (ReleaseAtCycles = 2)
+def ORYONWrite_8Cyc_FP0123_RC : SchedWriteRes<[ORYONFP0123]> {
+ let Latency = 8;
+ let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_6Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { // 6 cycles on FP3
+ let Latency = 6;
+}
+
+def ORYONWrite_7Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { // 7 cycles on FP3
+ let Latency = 7;
+}
+
+def ORYONWrite_8Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { // 8 cycles on FP3
+ let Latency = 8;
+}
+
+def ORYONWrite_9Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { // 9 cycles on FP3
+ let Latency = 9;
+}
+
+def ORYONWrite_10Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { // 10 cycles on FP3
+ let Latency = 10;
+}
+
+def ORYONWrite_8Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { // 8 cycles on FP3, ReleaseAtCycles = 2
+ let Latency = 8;
+ let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_10Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { // 10 cycles on FP3, ReleaseAtCycles = 2
+ let Latency = 10;
+ let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_13Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { // 13 cycles on FP3, ReleaseAtCycles = 2
+ let Latency = 13;
+ let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_4Cyc_FP0123_RC : // 4 cycles on FP0123, ReleaseAtCycles = 2
+SchedWriteRes<[ORYONFP0123]> {
+ let Latency = 4;
+ let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_4Cyc_FP0123_FP0123_RC : // 4 cycles, 2 uOps on 2 of FP0123
+SchedWriteRes<[ORYONFP0123, ORYONFP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2,2];
+}
+
+def ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC : // 4 cycles, 3 uOps on 3 of FP0123
+SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ReleaseAtCycles = [3,3,3];
+}
+
+def ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC : // 6 cycles, 4 uOps on 4 of FP0123
+SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123, ORYONFP0123]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ReleaseAtCycles = [6,6,6,6];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in IXU
+//===----------------------------------------------------------------------===//
+
+//---
+// Arithmetic Instructions
+//---
+
+// 1, 1, 6 (NOTE(review): these triples presumably read uOps, latency, throughput - confirm)
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^ADD(W|X)r(i|r|x)", "^SUB(W|X)r(i|r|x)")>;
+
+// 2, 2, 3
+def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC],
+ (instregex "^ADD(W|X)rs", "^SUB(W|X)rs")>;
+
+// 1, 1, 4; CMP and CMN are aliases of these, see page 75
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^ADDS(W|X)r(i|r|x)(64)?", "^SUBS(W|X)r(i|r|x)")>;
+
+// 2, 2, 2; CMP and CMN are aliases of these, see page 75
+def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC],
+ (instregex "^ADDS(W|X)rs", "^SUBS(W|X)rs")>;
+
+// 1, 1, 4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^ADC(W|X)r","^SBC(W|X)r",
+ "^ADCS(W|X)r","^SBCS(W|X)r")>;
+
+// 1, 1, 2
+def : InstRW<[ORYONWrite_1Cyc_2Uops_I01],
+ (instrs ADR,ADRP)>;
+
+// 1, 1, 4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^CSEL(W|X)r", "^CSINV(W|X)r",
+ "^CSNEG(W|X)r", "^CSINC(W|X)r")>;
+
+//---
+// Compare Instructions
+//---
+
+// CCMP and CCMN exist as LLVM DAG nodes.
+// CMP is an alias of SUBS, handled above.
+// CMN is an alias of ADDS, handled above.
+// There is also no way to get a shifted-compare node in LLVM.
+// 2, 2, 1.5 CMP, CMN
+
+// 1, 1, 4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^CCMP(W|X)(i|r)", "^CCMN(W|X)(i|r)")>;
+
+//---
+// Branch
+//---
+
+// Unconditional direct branch: no pipe.
+def : InstRW<[ORYONWrite_1Cyc_NONE], (instrs B)>;
+
+// Branch-and-link plus conditional / compare / test branches: 1 cycle on I01.
+def : InstRW<[ORYONWrite_1Cyc_I01],
+ (instrs BL,
+ Bcc, CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+
+// Indirect branches and return: 1 cycle on I0.
+def : InstRW<[ORYONWrite_1Cyc_I0], (instrs BR, BLR, RET)>;
+
+// V8.3a PAC branches and returns: 3 uOps in total, i.e. 7 cycles for
+// authentication, 1 cycle for updating the link register and 1 cycle for
+// the branch itself.
+def : InstRW<[ORYONWrite_9Cyc_I012],
+ (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ,
+ BRAA, BRAAZ, BRAB, BRABZ,
+ RETAA, RETAB, ERETAA, ERETAB)>;
+
+// Authenticating loads: 7 cycles, 3 uOps on I2.
+def : InstRW<[ORYONWrite_7Cyc_3Uops_I2], (instregex "^LDRAA", "^LDRAB")>;
+
+//---
+// Logical Instructions
+//---
+
+// 1, 1, 4; TST is an alias of ANDS
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^ANDS(W|X)r(i|r|x)", "^BICS(W|X)r(i|r|x)")>;
+
+// 2, 2, 2; TST with shift is an alias of the shifted ANDS form
+def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC],
+ (instregex "^ANDS(W|X)rs", "^BICS(W|X)rs")>;
+
+// 1, 1, 6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^AND(W|X)r(i|r|x)", "^EOR(W|X)r(i|r|x)",
+ "^ORR(W|X)r(i|r|x)", "^BIC(W|X)r(i|r|x)",
+ "^EON(W|X)r(i|r|x)", "^ORN(W|X)r(i|r|x)")>;
+
+// 2, 2, 3
+def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC],
+ (instregex "^AND(W|X)rs", "^EOR(W|X)rs", "^ORR(W|X)rs",
+ "^BIC(W|X)rs", "^EON(W|X)rs", "^ORN(W|X)rs")>;
+
+
+//---
+// Shift Instructions
+//---
+
+// 1, 1, 6: variable shifts/rotates and RMIF, 1 cycle on I012345
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^ASRV(W|X)r", "^LSLV(W|X)r",
+ "^LSRV(W|X)r", "^RORV(W|X)r",
+ "RMIF")>;
+
+//---
+// Move-Data, Bit-field and Sign-Extension Instructions
+//---
+
+// 1, 1, 6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^MOVK(W|X)i", "^MOVN(W|X)i",
+ "^MOVZ(W|X)i", "^SBFM(W|X)ri",
+ "^UBFM(W|X)ri", "^BFM(W|X)ri",
+ "^SXT(W|B|H|X)", "^UXT(H|B)")>;
+
+// COPY is an LLVM-internal pseudo instruction; its cost here needs further study
+def : InstRW<[ORYONWrite_1Cyc_I012345], (instrs COPY)>;
+
+//---
+// Reverse Instructions
+//---
+
+// 1, 1, 6: RBIT and REV/REV16/REV32 forms, 1 cycle on I012345
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^RBIT(W|X)r", "^REV(16|32|64)?(W|X)r")>;
+
+
+//---
+// Flag Manipulation Instructions
+//---
+
+// 1, 1, 4: 1 cycle on I0123
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+ (instregex "^SETF8", "^SETF16", "^CFINV")>;
+
+//---
+// Miscellaneous Instructions
+//---
+
+// 1, 1, 6: CLS, CLZ and EXTR, 1 cycle on I012345
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+ (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$", "^EXTR(W|X)rri")>;
+
+
+//---
+// Multiply Instructions
+//---
+
+// 1, 3, 2: 3 cycles on I45
+def : InstRW<[ORYONWrite_3Cyc_I45],
+ (instregex "^MADD(W|X)rrr", "^MSUB(W|X)rrr",
+ "^(S|U)MADDLrrr", "^(S|U)MSUBLrrr",
+ "^(S|U)MULHrr")>;
+
+//---
+// Divide Instructions
+//---
+
+def : InstRW<[ORYONWrite_7Cyc_I2_RC], // W-form (32-bit) divide: Latency 7 on I2
+ (instregex "^(S|U)DIVWr")>;
+
+def : InstRW<[ORYONWrite_9Cyc_I2_RC], // X-form (64-bit) divide: Latency 9 on I2
+ (instregex "^(S|U)DIVXr")>;
+
+
+//---
+// Cryptography Instructions
+//---
+// 1, 3, 1 on I2: CRC32 and CRC32C, 3 cycles
+def : InstRW<[ORYONWrite_3Cyc_I2],
+ (instregex "^CRC32(B|H|W|X)rr", "^CRC32C(B|H|W|X)rr")>;
+
+//---
+// PAU instructions
+//---
+
+// Page 47 of the IXU document gives 7 cycles for all PAU instructions, so
+// every signing, authentication and XPAC instruction listed here is assumed
+// to take 7 cycles on I2.
+def : InstRW<[ORYONWrite_7Cyc_I2],
+ (instrs
+ // signing instructions
+ PACIA, PACIB, PACDA, PACDB,
+ PACIZA, PACIZB, PACDZA, PACDZB,
+ PACGA,
+ // authentication instructions
+ AUTIA, AUTIB, AUTDA, AUTDB,
+ AUTIZA, AUTIZB, AUTDZA, AUTDZB,
+ // XPAC instructions
+ XPACI, XPACD)>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in LSU
+//===----------------------------------------------------------------------===//
+
+// 4 cycle Load-to-use from L1D$
+// Neon load with 5 cycle
+// 6 cycle to STA ?
+// STD cycle ?
+// NEON STD + 2
+
+// Load Instructions
+// FP Load Instructions
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+// The loads below all use the plain 4-cycle load write (ORYONLD only).
+
+// LDNP*i
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDNPDi, LDNPQi, LDNPSi, LDNPWi, LDNPXi)>;
+
+// LDP*i
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDPDi, LDPQi, LDPSi, LDPSWi, LDPWi, LDPXi)>;
+
+// LDR*ui
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDRBui, LDRDui, LDRHui, LDRQui, LDRSui)>;
+
+// LDR*l
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDRDl, LDRQl, LDRWl, LDRXl)>;
+
+// LDTR*i, including the sign-extending forms
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDTRBi, LDTRHi, LDTRWi, LDTRXi,
+ LDTRSBWi, LDTRSBXi, LDTRSHWi, LDTRSHXi, LDTRSWi)>;
+
+// Pre/post-indexed loads: 4-cycle load write that also takes an I012345
+// resource.
+
+// Pair loads, pre-index.
+// NOTE(review): LDPXpre is not listed here although LDPXpost is listed
+// below - confirm whether that is intentional.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre)>;
+
+// Single-register loads, pre-index.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre,
+ LDRSpre, LDRWpre, LDRXpre)>;
+
+// Sign-extending byte/halfword loads, pre- and post-index.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost,
+ LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost)>;
+
+// LDRBB/LDRHH, pre- and post-index.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>;
+
+// Pair loads, post-index.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>;
+
+// Single-register loads, post-index.
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+ (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost,
+ LDRSpost, LDRWpost, LDRXpost)>;
+
+// Register-offset (roW / roX) and LDUR* loads: plain 4-cycle load write.
+
+// LDR*roW
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDRBroW, LDRDroW, LDRHroW, LDRHHroW, LDRQroW,
+ LDRSroW, LDRSHWroW, LDRSHXroW, LDRWroW, LDRXroW)>;
+
+// LDR*roX
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDRBroX, LDRDroX, LDRHHroX, LDRHroX, LDRQroX,
+ LDRSroX, LDRSHWroX, LDRSHXroX, LDRWroX, LDRXroX)>;
+
+// LDUR*i, including the sign-extending forms
+def : InstRW<[ORYONWrite_4Cyc_LD],
+ (instrs LDURBi, LDURBBi, LDURDi, LDURHi, LDURHHi,
+ LDURQi, LDURSi, LDURXi,
+ LDURSBWi, LDURSBXi, LDURSHWi, LDURSHXi, LDURSWi)>;
+
+
+
+
+// Store register, immed post-index
+// NOTE: Handled by WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteST
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteSTP.
+
+// The stores below all use the plain 1-cycle store write (ORYONST only).
+
+// STUR*i
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STURBi, STURBBi, STURDi, STURHi, STURHHi,
+ STURQi, STURSi, STURWi, STURXi)>;
+
+// STTR*i
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
+
+// STNP*i
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STNPDi, STNPQi, STNPXi, STNPWi)>;
+
+// STP*i
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STPDi, STPQi, STPXi, STPWi)>;
+
+// STR*ui
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STRBui, STRDui, STRHui, STRQui, STRXui, STRWui)>;
+
+// Pre/post-indexed stores: 1-cycle store write that also takes an I012345
+// resource.
+
+// Pair stores, pre- and post-index.
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+ (instrs STPDpre, STPDpost,
+ STPSpre, STPSpost,
+ STPWpre, STPWpost,
+ STPXpre, STPXpost)>;
+
+// Single-register stores, pre- and post-index.
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+ (instrs STRBpre, STRBpost,
+ STRBBpre, STRBBpost,
+ STRDpre, STRDpost,
+ STRHpre, STRHpost,
+ STRHHpre, STRHHpost,
+ STRQpre, STRQpost,
+ STRSpre, STRSpost,
+ STRWpre, STRWpost,
+ STRXpre, STRXpost)>;
+
+// Register-offset stores (roW and roX forms): plain 1-cycle store write.
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instrs STRBroW, STRBroX,
+ STRDroW, STRDroX,
+ STRHroW, STRHroX,
+ STRHHroW, STRHHroX,
+ STRQroW, STRQroX,
+ STRSroW, STRSroX,
+ STRWroW, STRWroX,
+ STRXroW, STRXroX)>;
+
+// ASIMD Load instructions, modeled with the 5-cycle load writes. NOTE(review): this comment used to say "4 cycle access + 2 cycle NEON access" - confirm
+// ASIMD load, 1 element, multiple, 1 reg, D-form 1 uOp
+// ASIMD load, 1 element, multiple, 1 reg, Q-form 1 uOp
+def : InstRW<[ORYONWrite_5Cyc_LD],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_LD_I012345],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form 3 uOps
+// ASIMD load, 1 element, multiple, 2 reg, Q-form 2 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+ (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD],
+ (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form 4 uOps
+// ASIMD load, 1 element, multiple, 3 reg, Q-form 3 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+ (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+ (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form 6 uOps
+// ASIMD load, 1 element, multiple, 4 reg, Q-form 4 uOps
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S 2 uOps
+// ASIMD load, 1 element, one lane, D 2 uOps
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S 2 uOps
+// ASIMD load, 1 element, all lanes, D-form, D 2 uOps
+// ASIMD load, 1 element, all lanes, Q-form 2 uOps
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S 3 uOps
+// ASIMD load, 2 element, multiple, Q-form (B/H/S/D) 4 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+ (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+ (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+ (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+ (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H 3 uOps
+// ASIMD load, 2 element, one lane, S 3 uOps
+// ASIMD load, 2 element, one lane, D 3 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S 3 uOps
+// ASIMD load, 2 element, all lanes, D-form, D 3 uOps
+// ASIMD load, 2 element, all lanes, Q-form 3 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S 5 uOps
+// ASIMD load, 3 element, multiple, Q-form, B/H/S 6 uOps
+// ASIMD load, 3 element, multiple, Q-form, D 6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+ (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+ (instregex "^LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+ (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+ (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H 4 uOps
+// ASIMD load, 3 element, one lane, S 4 uOps
+// ASIMD load, 3 element, one lane, D 5 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD3i(64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+ (instregex "^LD3i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+ (instregex "^LD3i(64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S 4 uOps
+// ASIMD load, 3 element, all lanes, D-form, D 5 uOps
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S 4 uOps
+// ASIMD load, 3 element, all lanes, Q-form, D 5 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+ (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+ (instregex "^LD3Rv(1d|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+ (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+ (instregex "^LD3Rv(1d|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S 6 uOps
+// ASIMD load, 4 element, multiple, Q-form, B/H/S 10 uOps
+// ASIMD load, 4 element, multiple, Q-form, D 8 uOps
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+ (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_10Uops_LD],
+ (instregex "^LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_8Uops_LD],
+ (instregex "^LD4Fourv(2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+ (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_10Uops_LD_I012345],
+ (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_8Uops_LD_I012345],
+ (instregex "^LD4Fourv(2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H 5 uOps
+// ASIMD load, 4 element, one lane, S 5 uOps
+// ASIMD load, 4 element, one lane, D 6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], (instregex "^LD4i(64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+ (instregex "^LD4i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+ (instregex "^LD4i(64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S 5 uOps
+// ASIMD load, 4 element, all lanes, D-form, D 6 uOps
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S 5 uOps
+// ASIMD load, 4 element, all lanes, Q-form, D 6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+ (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+ (instregex "^LD4Rv(1d|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+ (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+ (instregex "^LD4Rv(1d|2d)_POST$")>;
+
+// ASIMD Store Instructions (1-cycle store writes; _POST forms also take an I012345 resource)
+// ASIMD store, 1 element, multiple, 1 reg, D-form 1 uOp
+// ASIMD store, 1 element, multiple, 1 reg, Q-form 1 uOp
+def : InstRW<[ORYONWrite_1Cyc_ST],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form 2 uOps
+// ASIMD store, 1 element, multiple, 2 reg, Q-form 2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form 3 uOps
+// ASIMD store, 1 element, multiple, 3 reg, Q-form 3 uOps
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form 4 uOps
+// ASIMD store, 1 element, multiple, 4 reg, Q-form 4 uOps
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S 2 uOps
+// ASIMD store, 1 element, one lane, D 2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S 2 uOps
+// ASIMD store, 2 element, multiple, Q-form, B/H/S 4 uOps
+// ASIMD store, 2 element, multiple, Q-form, D 4 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+ (instregex "^ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+ (instregex "^ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+ (instregex "^ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+ (instregex "^ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S 2 uOps
+// ASIMD store, 2 element, one lane, D 2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S 4 uOps
+// ASIMD store, 3 element, multiple, Q-form, B/H/S 6 uOps
+// ASIMD store, 3 element, multiple, Q-form, D 6 uOps
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+ (instregex "^ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_6Uops_ST],
+ (instregex "^ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+ (instregex "^ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_6Uops_ST_I012345],
+ (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H 2 uOps
+// ASIMD store, 3 element, one lane, S 2 uOps
+// ASIMD store, 3 element, one lane, D 4 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], (instregex "^ST3i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST3i(64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+ (instregex "^ST3i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+ (instregex "^ST3i(64)_POST$")>;
+
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S 5 uOps
+// ASIMD store, 4 element, multiple, Q-form, B/H/S 10 uOps
+// ASIMD store, 4 element, multiple, Q-form, D 8 uOps
+def : InstRW<[ORYONWrite_1Cyc_5Uops_ST],
+ (instregex "^ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_10Uops_ST],
+ (instregex "^ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_8Uops_ST],
+ (instregex "^ST4Fourv(2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_5Uops_ST_I012345],
+ (instregex "^ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_10Uops_ST_I012345],
+ (instregex "^ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_8Uops_ST_I012345],
+ (instregex "^ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H 3 uOps
+// ASIMD store, 4 element, one lane, S 3 uOps
+// ASIMD store, 4 element, one lane, D 4 uOps
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], (instregex "^ST4i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST4i(64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345],
+ (instregex "^ST4i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+ (instregex "^ST4i(64)_POST$")>;
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in VXU
+//===----------------------------------------------------------------------===//
+// all uOps are not clearly written in the VXU document
+
+// I2V
+def : InstRW<[ORYONWrite_I2V_4Cyc_I45], (instregex "^FMOV[HSD][WX]r", "^FMOVDXHighr")>;
+
+// I2V with convert
+def : InstRW<[ORYONWrite_I2V_7Cyc_I45], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>;
+
+// V2I
+def : InstRW<[ORYONWrite_V2I_3Cyc_FP01], (instregex "^FMOV[WX][HSD]r", "FMOVXDHighr")>;
+
+// V2I with convert 2nd [SU] necessary?
+def : InstRW<[ORYONWrite_V2I_6Cyc_FP01], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+
+// float to float move immediate, row 7 in big chart
+def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]r")>;
+def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]i")>;
+
+// float to float conversion within VXU, precision conversion
+def : InstRW<[ORYONWrite_V2V_6Cyc_FP01], (instregex "^FJCVTZS")>;
+def : InstRW<[ORYONWrite_V2V_3Cyc_FP0123], (instregex "^FCVT[HSD][HSD]r",
+ "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// floating comparison write to NZCV
+def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCMP(E)?[HSD]r[ir]")>;
+def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCCMP(E)?[HSD]rr")>;
+
+// floating point conditional select
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FCSEL")>;
+
+// floating multiply-add
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^(F|FN)MADD", "^(F|FN)MSUB")>;
+
+// floating unary, cycle/throughput? xls row14
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^F(ABS|NEG)[SD]r")>;
+
+//floating division/square root
+def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVHrr")>;
+def : InstRW<[ORYONWrite_8Cyc_FP3], (instregex "^FDIVSrr")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3], (instregex "^FDIVDrr")>;
+
+def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTHr")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTSr")>;
+def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTDr")>;
+
+//==========
+// SIMD move instructions
+//==========
+
+// ASIMD DUP element
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^DUPv.+lane")>;
+// ASIMD DUP general throughput undecided, 3? FP0123
+// VXU doc, p42, 2 uOps
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^DUPv.+gpr")>;
+
+// ASIMD insert, element to element
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^INSv.+lane")>;
+// ASIMD insert, gen reg 3? FP0123?
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^INSv.+gpr")>;
+
+// ASIMD move, FP immed
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^[SU]MOVv")>;
+
+//==========
+// SIMD arithmetic instructions
+//==========
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDv", "^SUBv",
+ "^BIFv", "^BITv", "^BSLv",
+ "^ANDv", "^BICv", "^EORv",
+ "^ORRv", "^ORNv")>;
+
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// floating division
+def : InstRW<[ORYONWrite_6Cyc_FP3], (instregex "^FDIVv.*16$")>;
+def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVv.*32$")>;
+def : InstRW<[ORYONWrite_9Cyc_FP3], (instregex "^FDIVv.*64$")>;
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMUL(X)?v",
+ "^FRECPSv", "^FRSQRTSv")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^MLAv","^MLSv", "^MULv",
+ "^PMULv", "UABAv")>;
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "SABAv", "SABDv",
+ "^(SH|UH)(ADD|SUB)v",
+ "^S(MAX|MIN)v",
+ "^(SQ|UQ)(ADD|SUB)v",
+ "^(SQ|SQR|UQ|UQR)SHLv",
+ "^(SR|UR)HADDv",
+ "^(SR|UR)SHLv",
+ "^UABDv",
+ "^U(MAX|MIN)v")>;
+// IMAX or UMAX in the above line
+//==========
+// SIMD compare instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv",
+ "^CMHSv",
+ "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv",
+ "^FACGEv", "^FACGTv")>;
+
+//==========
+// SIMD widening and narrowing arithmetic instructions
+//==========
+// NO need to list ADDHN2, RADDHN2, RSUBHN2 as they are not distinguished
+// from ADDHN, RADDHN, RSUBHN in td file(v16i8, v8i16, v4i32).
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDHNv",
+ "^SUBHNv",
+ "^RADDHNv",
+ "^RSUBHNv",
+ "^SABD(L|L2)v", "^UABD(L|L2)v",
+ "^(S|U)(ADD|SUB)(L|L2|W|W2)v")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^PMUL(L|L2)v","^SABA(L|L2)v",
+ "^(S|U|SQ)(MLA|MSL|MUL)(L|L2)v")>;
+
+//==========
+// SIMD unary arithmetic instructions
+//==========
+//^MVNv is an alias of ^NOTv
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ABSv", "^CLSv","^CLZv", "^CNTv",
+ "^NEGv", "^NOTv",
+ "^RBITv", "^REV(16|32|64)v",
+ "^SQ(ABS|NEG)v", "^SQ(XT|XTU)(N|N2)v",
+ "^(SU|US)QADDv",
+ "^UQXT(N|N2)v", "^XTN2?v")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FCVT(L|L2|N|N2|XN|XN2)v",
+ "^FRINT[AIMNPXZ]v",
+ "^FRSQRTEv",
+ "^(S|U)ADALPv",
+ "^(S|U)ADDLPv")>;
+
+
+def : InstRW<[ORYONWrite_3Cyc_FP0], (instregex "^URECPEv", "^URSQRTEv",
+ "^FRECPEv", "^FRECPXv")>;
+
+def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTv.*16$")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTv.*32$")>;
+def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTv.*64$")>;
+
+//==========
+// SIMD binary element arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMLAv", "^FMLSv")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^SQDMULHv",
+ "^SQRD(MLA|MLS|MUL)Hv")>;
+
+//==========
+// SIMD permute instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^EXTv", "^TRN(1|2)v",
+ "^UZP(1|2)v", "^ZIP(1|2)v")>;
+
+//==========
+// SIMD immediate instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^MOVIv", "^MVNIv")>;
+
+//==========
+// SIMD shift(immediate) instructions
+//==========
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^RSHR(N|N2)v", "^SHLv",
+ "^(SHL|SHR)(N|N2)v",
+ "^SLIv",
+ "^(SQ|SQR)SHR(U)?(N|N2)v",
+ "^(UQ|UQR)SHR(N|N2)v",
+ "^SQSHLUv",
+ "^SRIv",
+ "^(S|SR|U|UR)SHRv",
+ "^(S|SR|U|UR)SRAv",
+ "^(S|U)SHL(L|L2)v")>;
+
+//==========
+// SIMD floating-point and integer conversion instructions
+//==========
+// same as above conversion
+
+//==========
+// SIMD reduce (across vector lanes) instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDVv",
+ "^(FMAX|FMIN)(V|NMV)v",
+ "^(S|U)ADDLVv",
+ "^(S|U)(MAX|MIN)Vv")>;
+//==========
+// SIMD pairwise arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDPv", "^FADDPv",
+ "^(FMAX|FMIN)(NMP|P)v",
+ "^(S|U)(MIN|MAX)Pv")>;
+//==========
+// SIMD dot product instructions
+//==========
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(U|S)DOTv")>;
+
+//==========
+// SIMD table lookup instructions
+//==========
+// TBL 1-reg/2-reg; TBX 1-reg, 1uOp, throughput=4 latency=2
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instrs TBLv8i8One, TBLv16i8One,
+ TBXv8i8One, TBXv16i8One,
+ TBLv8i8Two, TBLv16i8Two)>;
+
+// TBL 3-reg/4-reg, 3uops, throughput=4/3=1.33 latency=4
+def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC],
+ (instrs TBLv8i8Three, TBLv16i8Three,
+ TBLv8i8Four, TBLv16i8Four)>;
+
+
+// TBX 2-reg 2 uOps, throughput=2 latency=4
+def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_RC], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// TBX 3-reg/4-reg, 4uOps, throughput=1, latency=6
+def : InstRW<[ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC],
+ (instrs TBXv8i8Three, TBXv16i8Three,
+ TBXv8i8Four, TBXv16i8Four)>;
+
+
+//==========
+// SIMD complex number arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FCADDv", "^FCMLAv")>;
+
+//==========
+// SIMD cryptographic instructions
+//==========
+// 3,4 on IMLA, CRYP
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^AES[DE]",
+ "^SM3(TT1|TT2)(A|B)")>;
+
+// 2,4 on CRYP
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^AESI?MC",
+ "^EOR3",
+ "^RAX1",
+ "^XAR",
+ "^BCAX",
+ "^SM3SS1",
+ "^SM3PART(W1|W2)")>;
+// 5,1 on CRYP
+def : InstRW<[ORYONWrite_5Cyc_FP1], (instregex "^SM4E",
+ "^SM4EKEY")>;
+
+// 2,1 on CRYP
+def : InstRW<[ORYONWrite_2Cyc_FP1], (instregex "^SHA1(H|SU0|SU1)",
+ "^SHA256SU0",
+ "^SHA512(SU0|SU1)")>;
+
+// 3,1 on CRYP
+def : InstRW<[ORYONWrite_3Cyc_FP1], (instregex "^SHA256SU1",
+ "^SHA512(H|H2)")>;
+
+// 4,0.25 on CRYP
+def : InstRW<[ORYONWrite_4Cyc_FP1_RC4], (instregex "^SHA1(C|P|M)",
+ "^SHA256(H|H2)")>;
+
+//==========
+// SIMD v8.6 instructions
+//==========
+// 4,2 on IMLA
+def : InstRW<[ORYONWrite_4Cyc_FP0123_RC], (instregex "^(S|U|US)MMLA$")>;
+
+// 4,0.5 on IMLA
+def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMMLA$")>;
+
+// 4,0.5 on IMLA
+def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMLAL(B|T)")>;
+
+// 3,4
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(US|SU)DOTv")>;
+
+// 3,1
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^BF(16)?DOTv")>;
+
+// 3,4
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^BFCVT(N|N2)?$")>;
+
+
+} // SchedModel = OryonModel
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 8bc26eeef34d..93ea729e2550 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -299,6 +299,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefLoopAlignment = Align(64);
MaxInterleaveFactor = 4;
break;
+ case Oryon:
+ CacheLineSize = 64;
+ PrefFunctionAlignment = Align(16);
+ MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ break;
}
if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f49c73dc7951..9f5756fc7e40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
+ cl::init(true), cl::Hidden);
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
return true;
return BaseT::shouldTreatInstructionLikeSelect(I);
}
+
+bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) {
+ // AArch64 specific here is adding the number of instructions to the
+ // comparison (though not as the first consideration, as some targets do)
+ // along with changing the priority of the base additions.
+ // TODO: Maybe a more nuanced tradeoff between instruction count
+ // and number of registers? To be investigated at a later date.
+ if (EnableLSRCostOpt)
+ return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
+ C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
+ C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+
+ return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+} \ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 2f44aaa3e26a..feec1a4289c3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -425,6 +425,9 @@ public:
}
std::optional<unsigned> getMinPageSize() const { return 4096; }
+
+ bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8e302786c746..d0d7a9dc1724 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1534,6 +1534,12 @@ def FeatureISAVersion11_5_1 : FeatureSet<
FeatureVGPRSingleUseHintInsts,
Feature1_5xVGPRs])>;
+def FeatureISAVersion11_5_2 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
+ FeatureVGPRSingleUseHintInsts])>;
+
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 625ac0230f16..2bdbf4151dd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -1017,7 +1017,7 @@ public:
//
// TODO: We could filter out subgraphs that do not access LDS globals.
for (Function *F : KernelsThatAllocateTableLDS)
- removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
+ removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"});
}
DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9c94ca1e4708..17c961578382 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -57,6 +57,7 @@
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
@@ -992,6 +993,10 @@ void AMDGPUPassConfig::addIRPasses() {
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+ // This can be disabled by passing ::Disable here or on the command line
+ // with --expand-variadics-override=disable.
+ addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 2ada981a77cd..d218ffeb1fec 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -295,7 +295,11 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
FeatureISAVersion11_5_1.Features
>;
-// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151]
+def : ProcessorModel<"gfx1152", GFX11SpeedModel,
+ FeatureISAVersion11_5_2.Features
+>;
+
+// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152]
def : ProcessorModel<"gfx11-generic", GFX11SpeedModel,
FeatureISAVersion11_Generic.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index d7d6e00d2389..e805e964ffe4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -113,6 +113,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break;
@@ -196,6 +197,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103;
case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150;
case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
+ case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152;
case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c47eea20563d..8b42d4a1dee7 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2052,9 +2052,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MemInfoMap &Visited,
SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
- if (!(MI.mayLoad() ^ MI.mayStore()))
- return false;
-
if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
return false;
@@ -2065,10 +2062,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
: AMDGPUAS::FLAT_ADDRESS;
- if (MI.mayLoad() &&
- TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
- return false;
-
if (AnchorList.count(&MI))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe2..5dc3457b5bfa 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ private:
MachineBasicBlock *emitEndCf(MachineInstr &MI);
- void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
- MachineInstr &MI) {
- MachineFunction &MF = *MBB->getParent();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- bool IsWave32 = ST.isWave32();
-
- if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
- // This should be before all vector instructions.
- MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
- .addImm(MI.getOperand(0).getImm());
- if (LIS) {
- LIS->RemoveMachineInstrFromMaps(MI);
- LIS->InsertMachineInstrInMaps(*InitMI);
- }
- MI.eraseFromParent();
- return;
- }
-
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- Register InputReg = MI.getOperand(0).getReg();
- MachineInstr *FirstMI = &*MBB->begin();
- if (InputReg.isVirtual()) {
- MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
- assert(DefInstr && DefInstr->isCopy());
- if (DefInstr->getParent() == MBB) {
- if (DefInstr != FirstMI) {
- // If the `InputReg` is defined in current block, we also need to
- // move that instruction to the beginning of the block.
- DefInstr->removeFromParent();
- MBB->insert(FirstMI, DefInstr);
- if (LIS)
- LIS->handleMove(*DefInstr);
- } else {
- // If first instruction is definition then move pointer after it.
- FirstMI = &*std::next(FirstMI->getIterator());
- }
- }
- }
-
- // Insert instruction sequence at block beginning (before vector operations).
- const DebugLoc DL = MI.getDebugLoc();
- const unsigned WavefrontSize = ST.getWavefrontSize();
- const unsigned Mask = (WavefrontSize << 1) - 1;
- Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- if (LV)
- LV->recomputeForSingleDefVirtReg(InputReg);
- auto BfmMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
- .addReg(CountReg)
- .addImm(0);
- auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(WavefrontSize);
- if (LV)
- LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
- auto CmovMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
-
- if (!LIS) {
- MI.eraseFromParent();
- return;
- }
-
- LIS->RemoveMachineInstrFromMaps(MI);
- MI.eraseFromParent();
-
- LIS->InsertMachineInstrInMaps(*BfeMI);
- LIS->InsertMachineInstrInMaps(*BfmMI);
- LIS->InsertMachineInstrInMaps(*CmpMI);
- LIS->InsertMachineInstrInMaps(*CmovMI);
-
- RecomputeRegs.insert(InputReg);
- LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
Changed = true;
break;
-
- // FIXME: find a better place for this
- case AMDGPU::SI_INIT_EXEC:
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
- lowerInitExec(MBB, MI);
- if (LIS)
- LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
- Changed = true;
- break;
-
- default:
- break;
}
if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f..5b4c44302fa6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ private:
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
+ SmallVector<MachineInstr *, 4> InitExecInstrs;
void printInfo();
@@ -223,6 +224,8 @@ private:
void lowerLiveMaskQueries();
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
+ void lowerInitExec(MachineInstr &MI);
+ void lowerInitExecInstrs();
public:
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+ InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
}
}
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ bool IsWave32 = ST->isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ MachineInstr *InitMI =
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->InsertMachineInstrInMaps(*InitMI);
+ }
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If first instruction is definition then move pointer after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST->getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+ for (MachineInstr *MI : InitExecInstrs)
+ lowerInitExec(*MI);
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
+ InitExecInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ lowerInitExecInstrs();
lowerLiveMaskQueries();
- return !LiveMaskQueries.empty();
+ return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
}
+ lowerInitExecInstrs();
+
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 239e0ee70572..04c6e940e6ed 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -235,8 +235,9 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
}
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
- StringRef FnAttr) {
- KernelRoot->removeFnAttr(FnAttr);
+ ArrayRef<StringRef> FnAttrs) {
+ for (StringRef Attr : FnAttrs)
+ KernelRoot->removeFnAttr(Attr);
SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
SmallPtrSet<Function *, 8> Visited;
@@ -261,12 +262,15 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
Function *PotentialCallee =
ExternalCallRecord.second->getFunction();
assert(PotentialCallee);
- if (!isKernelLDS(PotentialCallee))
- PotentialCallee->removeFnAttr(FnAttr);
+ if (!isKernelLDS(PotentialCallee)) {
+ for (StringRef Attr : FnAttrs)
+ PotentialCallee->removeFnAttr(Attr);
+ }
}
}
} else {
- Callee->removeFnAttr(FnAttr);
+ for (StringRef Attr : FnAttrs)
+ Callee->removeFnAttr(Attr);
if (Visited.insert(Callee).second)
WorkList.push_back(Callee);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 4d3ad328e131..e1cd4d03052b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -54,7 +55,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
/// Strip FnAttr attribute from any functions where we may have
/// introduced its use.
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
- StringRef FnAttr);
+ ArrayRef<StringRef> FnAttrs);
/// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
/// if this is actually a memory update or an artificial clobber to facilitate
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a46c383115e2..919828753f45 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) {
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
+static bool isHorizontalReduction(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -275,6 +281,16 @@ namespace {
if (VPT->getOpcode() == ARM::MVE_VPST)
return false;
+ // If the VPT block does not define something that is an "output", then
+ // the tail-predicated version will just perform a subset of the original
+ // vpt block, where the last lanes should not be used.
+ if (isVPTOpcode(VPT->getOpcode()) &&
+ all_of(Block.getInsts(), [](const MachineInstr *MI) {
+ return !MI->mayStore() && !MI->mayLoad() &&
+ !isHorizontalReduction(*MI) && !isVCTP(MI);
+ }))
+ return true;
+
auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
@@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) {
return (Flags & ARMII::DoubleWidthResult) != 0;
}
-static bool isHorizontalReduction(const MachineInstr &MI) {
- const MCInstrDesc &MCID = MI.getDesc();
- uint64_t Flags = MCID.TSFlags;
- return (Flags & ARMII::HorizontalReduction) != 0;
-}
-
// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 51384f25d245..9d7e4636abac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -171,6 +171,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
// Set operations for 'F' feature.
if (Subtarget.hasBasicF()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
@@ -186,6 +188,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::FRINT, MVT::f32, Legal);
@@ -202,7 +206,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
// Set operations for 'D' feature.
if (Subtarget.hasBasicD()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
@@ -219,6 +225,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -5004,6 +5012,10 @@ bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT,
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
+bool LoongArchTargetLowering::signExtendConstant(const ConstantInt *CI) const {
+ return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
+}
+
bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const {
// TODO: Support vectors.
if (Y.getValueType().isVector())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f274b1971fd2..9328831a17a3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -229,6 +229,7 @@ public:
bool isLegalAddImmediate(int64_t Imm) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
+ bool signExtendConstant(const ConstantInt *CI) const override;
bool hasAndNotCompare(SDValue Y) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 83466d53f84d..c29c1b593321 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -46,7 +46,7 @@ static cl::opt<bool>
static std::string computeDataLayout(const Triple &TT) {
if (TT.isArch64Bit())
- return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
+ return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128";
assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported");
return "e-m:e-p:32:32-i64:64-n32-S128";
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 5eefab59a6ab..b0cb24c63c3c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -40,7 +40,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMLegacyPass();
ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
-FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMIntrRangePass();
FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
@@ -53,12 +53,7 @@ MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
- NVVMIntrRangePass();
- NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-
-private:
- unsigned SmVersion;
};
struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index f63697916d90..82770f866085 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -542,30 +542,24 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
- unsigned Reqntidx, Reqntidy, Reqntidz;
- Reqntidx = Reqntidy = Reqntidz = 1;
- bool ReqSpecified = false;
- ReqSpecified |= getReqNTIDx(F, Reqntidx);
- ReqSpecified |= getReqNTIDy(F, Reqntidy);
- ReqSpecified |= getReqNTIDz(F, Reqntidz);
+ std::optional<unsigned> Reqntidx = getReqNTIDx(F);
+ std::optional<unsigned> Reqntidy = getReqNTIDy(F);
+ std::optional<unsigned> Reqntidz = getReqNTIDz(F);
- if (ReqSpecified)
- O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz
- << "\n";
+ if (Reqntidx || Reqntidy || Reqntidz)
+ O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
+ << ", " << Reqntidz.value_or(1) << "\n";
// If the NVVM IR has some of maxntid* specified, then output
// the maxntid directive, and set the unspecified ones to 1.
// If none of maxntid* is specified, don't output maxntid directive.
- unsigned Maxntidx, Maxntidy, Maxntidz;
- Maxntidx = Maxntidy = Maxntidz = 1;
- bool MaxSpecified = false;
- MaxSpecified |= getMaxNTIDx(F, Maxntidx);
- MaxSpecified |= getMaxNTIDy(F, Maxntidy);
- MaxSpecified |= getMaxNTIDz(F, Maxntidz);
-
- if (MaxSpecified)
- O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz
- << "\n";
+ std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
+ std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
+ std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
+
+ if (Maxntidx || Maxntidy || Maxntidz)
+ O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
+ << ", " << Maxntidz.value_or(1) << "\n";
unsigned Mincta = 0;
if (getMinCTASm(F, Mincta))
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 4dc3cea4bd8e..b60a1d747af7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -233,9 +233,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(
[this](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
- // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
- // investigate and re-enable.
- // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
+ // Note: NVVMIntrRangePass was causing numerical discrepancies at one
+ // point, if issues crop up, consider disabling.
+ FPM.addPass(NVVMIntrRangePass());
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
});
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 013afe916e86..3a536db1c972 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -128,6 +128,14 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
return true;
}
+static std::optional<unsigned>
+findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) {
+ unsigned RetVal;
+ if (findOneNVVMAnnotation(&GV, PropName, RetVal))
+ return RetVal;
+ return std::nullopt;
+}
+
bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
auto &AC = getAnnotationCache();
@@ -252,32 +260,57 @@ std::string getSamplerName(const Value &val) {
return std::string(val.getName());
}
-bool getMaxNTIDx(const Function &F, unsigned &x) {
- return findOneNVVMAnnotation(&F, "maxntidx", x);
+std::optional<unsigned> getMaxNTIDx(const Function &F) {
+ return findOneNVVMAnnotation(F, "maxntidx");
}
-bool getMaxNTIDy(const Function &F, unsigned &y) {
- return findOneNVVMAnnotation(&F, "maxntidy", y);
+std::optional<unsigned> getMaxNTIDy(const Function &F) {
+ return findOneNVVMAnnotation(F, "maxntidy");
}
-bool getMaxNTIDz(const Function &F, unsigned &z) {
- return findOneNVVMAnnotation(&F, "maxntidz", z);
+std::optional<unsigned> getMaxNTIDz(const Function &F) {
+ return findOneNVVMAnnotation(F, "maxntidz");
+}
+
+std::optional<unsigned> getMaxNTID(const Function &F) {
+ // Note: The semantics here are a bit strange. The PTX ISA states the
+ // following (11.4.2. Performance-Tuning Directives: .maxntid):
+ //
+ // Note that this directive guarantees that the total number of threads does
+ // not exceed the maximum, but does not guarantee that the limit in any
+ // particular dimension is not exceeded.
+ std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
+ std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
+ std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
+ if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
+ return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
+ return std::nullopt;
}
bool getMaxClusterRank(const Function &F, unsigned &x) {
return findOneNVVMAnnotation(&F, "maxclusterrank", x);
}
-bool getReqNTIDx(const Function &F, unsigned &x) {
- return findOneNVVMAnnotation(&F, "reqntidx", x);
+std::optional<unsigned> getReqNTIDx(const Function &F) {
+ return findOneNVVMAnnotation(F, "reqntidx");
+}
+
+std::optional<unsigned> getReqNTIDy(const Function &F) {
+ return findOneNVVMAnnotation(F, "reqntidy");
}
-bool getReqNTIDy(const Function &F, unsigned &y) {
- return findOneNVVMAnnotation(&F, "reqntidy", y);
+std::optional<unsigned> getReqNTIDz(const Function &F) {
+ return findOneNVVMAnnotation(F, "reqntidz");
}
-bool getReqNTIDz(const Function &F, unsigned &z) {
- return findOneNVVMAnnotation(&F, "reqntidz", z);
+std::optional<unsigned> getReqNTID(const Function &F) {
+ // Note: The semantics here are a bit strange. See getMaxNTID.
+ std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
+ std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
+ std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
+ if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
+ return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
+ return std::nullopt;
}
bool getMinCTASm(const Function &F, unsigned &x) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 2872db9fa213..e020bc0f02e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -48,13 +48,15 @@ std::string getTextureName(const Value &);
std::string getSurfaceName(const Value &);
std::string getSamplerName(const Value &);
-bool getMaxNTIDx(const Function &, unsigned &);
-bool getMaxNTIDy(const Function &, unsigned &);
-bool getMaxNTIDz(const Function &, unsigned &);
-
-bool getReqNTIDx(const Function &, unsigned &);
-bool getReqNTIDy(const Function &, unsigned &);
-bool getReqNTIDz(const Function &, unsigned &);
+std::optional<unsigned> getMaxNTIDx(const Function &);
+std::optional<unsigned> getMaxNTIDy(const Function &);
+std::optional<unsigned> getMaxNTIDz(const Function &);
+std::optional<unsigned> getMaxNTID(const Function &F);
+
+std::optional<unsigned> getReqNTIDx(const Function &);
+std::optional<unsigned> getReqNTIDy(const Function &);
+std::optional<unsigned> getReqNTIDz(const Function &);
+std::optional<unsigned> getReqNTID(const Function &);
bool getMaxClusterRank(const Function &, unsigned &);
bool getMinCTASm(const Function &, unsigned &);
diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 5381646434eb..f9d21b38a7ec 100644
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -1,4 +1,4 @@
-//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//===- NVVMIntrRange.cpp - Set range attributes for NVVM intrinsics -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,19 +6,21 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass adds appropriate !range metadata for calls to NVVM
+// This pass adds appropriate range attributes for calls to NVVM
// intrinsics that return a limited range of values.
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "llvm/IR/Constants.h"
+#include "NVPTXUtilities.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
+#include <cstdint>
using namespace llvm;
@@ -26,31 +28,20 @@ using namespace llvm;
namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
-// Add !range metadata based on limits of given SM variant.
-static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
- cl::Hidden, cl::desc("SM variant"));
-
namespace {
class NVVMIntrRange : public FunctionPass {
- private:
- unsigned SmVersion;
-
- public:
- static char ID;
- NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
- NVVMIntrRange(unsigned int SmVersion)
- : FunctionPass(ID), SmVersion(SmVersion) {
+public:
+ static char ID;
+ NVVMIntrRange() : FunctionPass(ID) {
- initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
- }
+ initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &) override;
+ bool runOnFunction(Function &) override;
};
-}
+} // namespace
-FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
- return new NVVMIntrRange(SmVersion);
-}
+FunctionPass *llvm::createNVVMIntrRangePass() { return new NVVMIntrRange(); }
char NVVMIntrRange::ID = 0;
INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
@@ -58,112 +49,110 @@ INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
// Adds the passed-in [Low,High) range information as metadata to the
// passed-in call instruction.
-static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
- // This call already has range metadata, nothing to do.
- if (C->getMetadata(LLVMContext::MD_range))
+static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
+ if (II->getMetadata(LLVMContext::MD_range))
return false;
- LLVMContext &Context = C->getParent()->getContext();
- IntegerType *Int32Ty = Type::getInt32Ty(Context);
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
- ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
- C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+ const uint64_t BitWidth = II->getType()->getIntegerBitWidth();
+ ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High));
+
+ if (auto CurrentRange = II->getRange())
+ Range = Range.intersectWith(CurrentRange.value());
+
+ II->addRangeRetAttr(Range);
return true;
}
-static bool runNVVMIntrRange(Function &F, unsigned SmVersion) {
+static bool runNVVMIntrRange(Function &F) {
struct {
unsigned x, y, z;
} MaxBlockSize, MaxGridSize;
- MaxBlockSize.x = 1024;
- MaxBlockSize.y = 1024;
- MaxBlockSize.z = 64;
- MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+ const unsigned MetadataNTID = getReqNTID(F).value_or(
+ getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+
+ MaxBlockSize.x = std::min(1024u, MetadataNTID);
+ MaxBlockSize.y = std::min(1024u, MetadataNTID);
+ MaxBlockSize.z = std::min(64u, MetadataNTID);
+
+ MaxGridSize.x = 0x7fffffff;
MaxGridSize.y = 0xffff;
MaxGridSize.z = 0xffff;
// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F)) {
- CallInst *Call = dyn_cast<CallInst>(&I);
- if (!Call)
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
continue;
- if (Function *Callee = Call->getCalledFunction()) {
- switch (Callee->getIntrinsicID()) {
- // Index within block
- case Intrinsic::nvvm_read_ptx_sreg_tid_x:
- Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_tid_y:
- Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_tid_z:
- Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
- break;
-
- // Block size
- case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
- Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
- Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
- Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
- break;
-
- // Index within grid
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
- Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
- Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
- Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
- break;
-
- // Grid size
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
- Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
- Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
- break;
- case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
- Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
- break;
-
- // warp size is constant 32.
- case Intrinsic::nvvm_read_ptx_sreg_warpsize:
- Changed |= addRangeMetadata(32, 32+1, Call);
- break;
-
- // Lane ID is [0..warpsize)
- case Intrinsic::nvvm_read_ptx_sreg_laneid:
- Changed |= addRangeMetadata(0, 32, Call);
- break;
-
- default:
- break;
- }
+ switch (II->getIntrinsicID()) {
+ // Index within block
+ case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+ Changed |= addRangeAttr(0, MaxBlockSize.x, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+ Changed |= addRangeAttr(0, MaxBlockSize.y, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+ Changed |= addRangeAttr(0, MaxBlockSize.z, II);
+ break;
+
+ // Block size
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+ Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+ Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+ Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
+ break;
+
+ // Index within grid
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+ Changed |= addRangeAttr(0, MaxGridSize.x, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+ Changed |= addRangeAttr(0, MaxGridSize.y, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+ Changed |= addRangeAttr(0, MaxGridSize.z, II);
+ break;
+
+ // Grid size
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+ Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+ Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
+ break;
+ case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+ Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
+ break;
+
+ // warp size is constant 32.
+ case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+ Changed |= addRangeAttr(32, 32 + 1, II);
+ break;
+
+ // Lane ID is [0..warpsize)
+ case Intrinsic::nvvm_read_ptx_sreg_laneid:
+ Changed |= addRangeAttr(0, 32, II);
+ break;
+
+ default:
+ break;
}
}
return Changed;
}
-bool NVVMIntrRange::runOnFunction(Function &F) {
- return runNVVMIntrRange(F, SmVersion);
-}
-
-NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {}
+bool NVVMIntrRange::runOnFunction(Function &F) { return runNVVMIntrRange(F); }
PreservedAnalyses NVVMIntrRangePass::run(Function &F,
FunctionAnalysisManager &AM) {
- return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
+ return runNVVMIntrRange(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
}
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f4e84ade3b5a..bc0ae7a32c05 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1079,13 +1079,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
assert(IsAIX && TM.getCodeModel() == CodeModel::Small &&
"PseudoOp only valid for small code model AIX");
- // Transform %rN = ADDItoc/8 @op1, %r2.
+ // Transform %rN = ADDItoc/8 %r2, @op1.
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to load address.
TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8));
- const MachineOperand &MO = MI->getOperand(1);
+ const MachineOperand &MO = MI->getOperand(2);
assert(MO.isGlobal() && "Invalid operand for ADDItoc[8].");
// Map the operand to its corresponding MCSymbol.
@@ -1094,7 +1094,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_None, OutContext);
- TmpInst.getOperand(1) = TmpInst.getOperand(2);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 735050641adf..a07954bd0d8b 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2080,13 +2080,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
cast<GlobalVariable>(GV)->hasAttribute("toc-data");
// For small code model, generate a simple TOC load.
- if (CModel == CodeModel::Small)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
- IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc),
- DestReg)
- .addGlobalAddress(GV)
- .addReg(PPC::X2);
- else {
+ if (CModel == CodeModel::Small) {
+ auto MIB = BuildMI(
+ *FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), DestReg);
+ if (IsAIXTocData)
+ MIB.addReg(PPC::X2).addGlobalAddress(GV);
+ else
+ MIB.addGlobalAddress(GV).addReg(PPC::X2);
+ } else {
// If the address is an externally defined symbol, a symbol with common
// or externally available linkage, a non-local function address, or a
// jump table address (not yet needed), or if we are generating code
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 275b3337a276..1a69d1e89313 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6102,8 +6102,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
EVT OperandTy) {
SDValue GA = TocEntry->getOperand(0);
SDValue TocBase = TocEntry->getOperand(1);
- SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase);
- transferMemOperands(TocEntry, MN);
+ SDNode *MN = nullptr;
+ if (OpCode == PPC::ADDItoc || OpCode == PPC::ADDItoc8)
+ // toc-data access doesn't involve in loading from got, no need to
+ // keep memory operands.
+ MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, TocBase, GA);
+ else {
+ MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase);
+ transferMemOperands(TocEntry, MN);
+ }
ReplaceNode(TocEntry, MN);
};
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 9af8ada78376..eda5eb975e70 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1485,11 +1485,9 @@ def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:
}
// Local Data Transform
-def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
"#ADDItoc8",
- [(set i64:$rD,
- (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
-
+ []>, isPPC64;
let mayLoad = 1 in
def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
"#LDtocL", []>, isPPC64;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index df6b2bf1a7b7..09f829943528 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3345,10 +3345,8 @@ def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor
def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
"#ADDIStocHA", []>;
// TOC Data Transform on AIX
-def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
- "#ADDItoc",
- [(set i32:$rD,
- (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tocentry32:$disp),
+ "#ADDItoc", []>;
def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
"#ADDItocL", []>;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a96768240a93..82358cdd45ed 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -932,11 +932,11 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
"Can't handle X0, X0 vsetvli yet");
if (AVLReg == RISCV::X0)
NewInfo.setAVLVLMAX();
- else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS))
- NewInfo.setAVLRegDef(VNI, AVLReg);
- else {
- assert(MI.getOperand(1).isUndef());
+ else if (MI.getOperand(1).isUndef())
NewInfo.setAVLIgnored();
+ else {
+ VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS);
+ NewInfo.setAVLRegDef(VNI, AVLReg);
}
}
NewInfo.setVTYPE(MI.getOperand(2).getImm());
@@ -1008,11 +1008,11 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
}
else
InstrInfo.setAVLImm(Imm);
- } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) {
- InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
- } else {
- assert(VLOp.isUndef());
+ } else if (VLOp.isUndef()) {
InstrInfo.setAVLIgnored();
+ } else {
+ VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS);
+ InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
}
} else {
assert(isScalarExtractInstr(MI));
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 6d926ce551e0..b0949f5fc1d7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -1033,6 +1033,22 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
let HasVecPolicyOp = 1;
}
+class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
+ DAGOperand OpClass,
+ string Constraint = "",
+ int TargetConstraintType = 1> :
+ Pseudo<(outs RetClass:$rd),
+ (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Constraint;
+ let TargetOverlapConstraintType = TargetConstraintType;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+}
+
class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
@@ -1422,24 +1438,6 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
let UsesVXRM = 0;
}
-// Like VPseudoBinaryNoMask, but output can be V0.
-class VPseudoBinaryMOutNoMask<VReg RetClass,
- VReg Op1Class,
- DAGOperand Op2Class,
- string Constraint,
- int TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>,
- RISCVVPseudo {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let Constraints = Constraint;
- let TargetOverlapConstraintType = TargetConstraintType;
- let HasVLOp = 1;
- let HasSEWOp = 1;
-}
-
// Like VPseudoBinaryMask, but output can be V0.
class VPseudoBinaryMOutMask<VReg RetClass,
RegisterClass Op1Class,
@@ -2056,9 +2054,10 @@ multiclass VPseudoVSFS_M {
foreach mti = AllMasks in {
defvar mx = mti.LMul.MX;
let VLMul = mti.LMul.value in {
- def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>,
+ def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>,
SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
forceMergeOpRead=true>;
+ let ForceTailAgnostic = true in
def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>,
SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
forceMergeOpRead=true>;
@@ -2172,8 +2171,8 @@ multiclass VPseudoBinaryM<VReg RetClass,
int TargetConstraintType = 1,
bit Commutable = 0> {
let VLMul = MInfo.value, isCommutable = Commutable in {
- def "_" # MInfo.MX : VPseudoBinaryMOutNoMask<RetClass, Op1Class, Op2Class,
- Constraint, TargetConstraintType>;
+ def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
+ Constraint, TargetConstraintType>;
let ForceTailAgnostic = true in
def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask<RetClass, Op1Class,
Op2Class, Constraint, TargetConstraintType>,
@@ -4078,9 +4077,8 @@ class VPatMaskUnaryNoMask<string intrinsic_name,
(mti.Mask VR:$rs2),
VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX)
- (mti.Mask (IMPLICIT_DEF)),
(mti.Mask VR:$rs2),
- GPR:$vl, mti.Log2SEW, TA_MA)>;
+ GPR:$vl, mti.Log2SEW)>;
class VPatMaskUnaryMask<string intrinsic_name,
string inst,
@@ -4153,27 +4151,6 @@ class VPatBinaryNoMaskTU<string intrinsic_name,
(op2_type op2_kind:$rs2),
GPR:$vl, sew, TU_MU)>;
-class VPatBinaryNoMaskRoundingMode<string intrinsic_name,
- string inst,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- int sew,
- VReg op1_reg_class,
- DAGOperand op2_kind> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type (undef)),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- (XLenVT timm:$round),
- VLOpFrag)),
- (!cast<Instruction>(inst)
- (result_type (IMPLICIT_DEF)),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- (XLenVT timm:$round),
- GPR:$vl, sew, TA_MA)>;
-
class VPatBinaryNoMaskTURoundingMode<string intrinsic_name,
string inst,
ValueType result_type,
@@ -4827,8 +4804,6 @@ multiclass VPatBinaryRoundingMode<string intrinsic,
VReg result_reg_class,
VReg op1_reg_class,
DAGOperand op2_kind> {
- def : VPatBinaryNoMaskRoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
- sew, op1_reg_class, op2_kind>;
def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
sew, result_reg_class, op1_reg_class, op2_kind>;
def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
@@ -6962,12 +6937,12 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
foreach vti = AllIntegerVectors in {
// Emit shift by 1 as an add since it might be faster.
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector undef),
+ def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge),
(vti.Vector vti.RegClass:$rs1),
(XLenVT 1), VLOpFrag)),
(!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX)
- (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1,
- vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TA_MA)>;
+ vti.RegClass:$merge, vti.RegClass:$rs1,
+ vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>;
def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
(vti.Vector vti.RegClass:$rs1),
(XLenVT 1),
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 956b851fce6c..49838e685a6d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1459,11 +1459,22 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call,
Component == 3 ? NumActualRetComponents - 1 : Component;
assert(ExtractedComposite < NumActualRetComponents &&
"Invalid composite index!");
+ Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+ SPIRVType *NewType = nullptr;
+ if (QueryResultType->getOpcode() == SPIRV::OpTypeVector) {
+ Register NewTypeReg = QueryResultType->getOperand(1).getReg();
+ if (TypeReg != NewTypeReg &&
+ (NewType = GR->getSPIRVTypeForVReg(NewTypeReg)) != nullptr)
+ TypeReg = NewTypeReg;
+ }
MIRBuilder.buildInstr(SPIRV::OpCompositeExtract)
.addDef(Call->ReturnRegister)
- .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(TypeReg)
.addUse(QueryResult)
.addImm(ExtractedComposite);
+ if (NewType != nullptr)
+ insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder,
+ MIRBuilder.getMF().getRegInfo());
} else {
// More than 1 component is expected, fill a new vector.
auto MIB = MIRBuilder.buildInstr(SPIRV::OpVectorShuffle)
@@ -2063,16 +2074,30 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call,
auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR);
switch (Opcode) {
- case SPIRV::OpGroupAsyncCopy:
- return MIRBuilder.buildInstr(Opcode)
- .addDef(Call->ReturnRegister)
- .addUse(GR->getSPIRVTypeID(Call->ReturnType))
- .addUse(Scope)
- .addUse(Call->Arguments[0])
- .addUse(Call->Arguments[1])
- .addUse(Call->Arguments[2])
- .addUse(buildConstantIntReg(1, MIRBuilder, GR))
- .addUse(Call->Arguments[3]);
+ case SPIRV::OpGroupAsyncCopy: {
+ SPIRVType *NewType =
+ Call->ReturnType->getOpcode() == SPIRV::OpTypeEvent
+ ? nullptr
+ : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder);
+ Register TypeReg = GR->getSPIRVTypeID(NewType ? NewType : Call->ReturnType);
+ unsigned NumArgs = Call->Arguments.size();
+ Register EventReg = Call->Arguments[NumArgs - 1];
+ bool Res = MIRBuilder.buildInstr(Opcode)
+ .addDef(Call->ReturnRegister)
+ .addUse(TypeReg)
+ .addUse(Scope)
+ .addUse(Call->Arguments[0])
+ .addUse(Call->Arguments[1])
+ .addUse(Call->Arguments[2])
+ .addUse(Call->Arguments.size() > 4
+ ? Call->Arguments[3]
+ : buildConstantIntReg(1, MIRBuilder, GR))
+ .addUse(EventReg);
+ if (NewType != nullptr)
+ insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder,
+ MIRBuilder.getMF().getRegInfo());
+ return Res;
+ }
case SPIRV::OpGroupWaitEvents:
return MIRBuilder.buildInstr(Opcode)
.addUse(Scope)
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 24c6c2688642..edc9e1a33d9f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -586,6 +586,7 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC
// Async Copy and Prefetch builtin records:
defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
+defm : DemangledNativeBuiltin<"async_work_group_strided_copy", OpenCL_std, AsyncCopy, 5, 5, OpGroupAsyncCopy>;
defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>;
defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 5ef0be1cab72..bbd25dc85f52 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -61,9 +61,6 @@ class SPIRVEmitIntrinsics
DenseMap<Instruction *, Type *> AggrConstTypes;
DenseSet<Instruction *> AggrStores;
- // a registry of created Intrinsic::spv_assign_ptr_type instructions
- DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
-
// deduce element type of untyped pointers
Type *deduceElementType(Value *I);
Type *deduceElementTypeHelper(Value *I);
@@ -98,14 +95,16 @@ class SPIRVEmitIntrinsics
return B.CreateIntrinsic(IntrID, {Types}, Args);
}
+ void buildAssignType(IRBuilder<> &B, Type *ElemTy, Value *Arg);
void buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg);
+ void updateAssignType(CallInst *AssignCI, Value *Arg, Value *OfType);
void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B);
- void insertAssignTypeInstrForTargetExtTypes(TargetExtType *AssignedType,
- Value *V, IRBuilder<> &B);
+ void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V,
+ IRBuilder<> &B);
void replacePointerOperandWithPtrCast(Instruction *I, Value *Pointer,
Type *ExpectedElementType,
unsigned OperandToReplace,
@@ -218,15 +217,39 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
false);
}
+void SPIRVEmitIntrinsics::buildAssignType(IRBuilder<> &B, Type *Ty,
+ Value *Arg) {
+ Value *OfType = PoisonValue::get(Ty);
+ CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type,
+ {Arg->getType()}, OfType, Arg, {}, B);
+ GR->addAssignPtrTypeInstr(Arg, AssignCI);
+}
+
void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy,
Value *Arg) {
- CallInst *AssignPtrTyCI =
- buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()},
- Constant::getNullValue(ElemTy), Arg,
- {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
+ Value *OfType = PoisonValue::get(ElemTy);
+ CallInst *AssignPtrTyCI = buildIntrWithMD(
+ Intrinsic::spv_assign_ptr_type, {Arg->getType()}, OfType, Arg,
+ {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
GR->addDeducedElementType(Arg, ElemTy);
- AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
+ GR->addAssignPtrTypeInstr(Arg, AssignPtrTyCI);
+}
+
+void SPIRVEmitIntrinsics::updateAssignType(CallInst *AssignCI, Value *Arg,
+ Value *OfType) {
+ LLVMContext &Ctx = Arg->getContext();
+ AssignCI->setArgOperand(
+ 1, MetadataAsValue::get(
+ Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OfType))));
+ if (cast<IntrinsicInst>(AssignCI)->getIntrinsicID() !=
+ Intrinsic::spv_assign_ptr_type)
+ return;
+
+ // update association with the pointee type
+ Type *ElemTy = OfType->getType();
+ GR->addDeducedElementType(AssignCI, ElemTy);
+ GR->addDeducedElementType(Arg, ElemTy);
}
// Set element pointer type to the given value of ValueTy and tries to
@@ -513,19 +536,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(Instruction *I) {
if (!Ty) {
GR->addDeducedElementType(Op, KnownElemTy);
// check if there is existing Intrinsic::spv_assign_ptr_type instruction
- auto It = AssignPtrTypeInstr.find(Op);
- if (It == AssignPtrTypeInstr.end()) {
+ CallInst *AssignCI = GR->findAssignPtrTypeInstr(Op);
+ if (AssignCI == nullptr) {
Instruction *User = dyn_cast<Instruction>(Op->use_begin()->get());
setInsertPointSkippingPhis(B, User ? User->getNextNode() : I);
CallInst *CI =
buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {OpTy}, OpTyVal, Op,
{B.getInt32(getPointerAddressSpace(OpTy))}, B);
- AssignPtrTypeInstr[Op] = CI;
+ GR->addAssignPtrTypeInstr(Op, CI);
} else {
- It->second->setArgOperand(
- 1,
- MetadataAsValue::get(
- Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OpTyVal))));
+ updateAssignType(AssignCI, Op, OpTyVal);
}
} else {
if (auto *OpI = dyn_cast<Instruction>(Op)) {
@@ -559,7 +579,9 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
if (isAssignTypeInstr(U)) {
B.SetInsertPoint(U);
SmallVector<Value *, 2> Args = {New, U->getOperand(1)};
- B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
+ CallInst *AssignCI =
+ B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
+ GR->addAssignPtrTypeInstr(New, AssignCI);
U->eraseFromParent();
} else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) ||
isa<CallInst>(U)) {
@@ -751,33 +773,39 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
return NewI;
}
-void SPIRVEmitIntrinsics::insertAssignTypeInstrForTargetExtTypes(
+void SPIRVEmitIntrinsics::insertAssignPtrTypeTargetExt(
TargetExtType *AssignedType, Value *V, IRBuilder<> &B) {
- // Do not emit spv_assign_type if the V is of the AssignedType already.
- if (V->getType() == AssignedType)
- return;
+ Type *VTy = V->getType();
- // Do not emit spv_assign_type if there is one already targetting V. If the
- // found spv_assign_type assigns a type different than AssignedType, report an
- // error. Builtin types cannot be redeclared or casted.
- for (auto User : V->users()) {
- auto *II = dyn_cast<IntrinsicInst>(User);
- if (!II || II->getIntrinsicID() != Intrinsic::spv_assign_type)
- continue;
+ // A couple of sanity checks.
+ assert(isPointerTy(VTy) && "Expect a pointer type!");
+ if (auto PType = dyn_cast<TypedPointerType>(VTy))
+ if (PType->getElementType() != AssignedType)
+ report_fatal_error("Unexpected pointer element type!");
- MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
- Type *BuiltinType =
- dyn_cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
- if (BuiltinType != AssignedType)
- report_fatal_error("Type mismatch " + BuiltinType->getTargetExtName() +
- "/" + AssignedType->getTargetExtName() +
- " for value " + V->getName(),
- false);
+ CallInst *AssignCI = GR->findAssignPtrTypeInstr(V);
+ if (!AssignCI) {
+ buildAssignType(B, AssignedType, V);
return;
}
- Constant *Const = UndefValue::get(AssignedType);
- buildIntrWithMD(Intrinsic::spv_assign_type, {V->getType()}, Const, V, {}, B);
+ Type *CurrentType =
+ dyn_cast<ConstantAsMetadata>(
+ cast<MetadataAsValue>(AssignCI->getOperand(1))->getMetadata())
+ ->getType();
+ if (CurrentType == AssignedType)
+ return;
+
+ // Builtin types cannot be redeclared or cast.
+ if (CurrentType->isTargetExtTy())
+ report_fatal_error("Type mismatch " + CurrentType->getTargetExtName() +
+ "/" + AssignedType->getTargetExtName() +
+ " for value " + V->getName(),
+ false);
+
+ // Our previous guess about the type seems to be wrong; update the
+ // inferred type according to the new, more precise type information.
+ updateAssignType(AssignCI, V, PoisonValue::get(AssignedType));
}
void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
@@ -850,7 +878,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B);
GR->addDeducedElementType(CI, ExpectedElementType);
GR->addDeducedElementType(Pointer, ExpectedElementType);
- AssignPtrTypeInstr[Pointer] = CI;
+ GR->addAssignPtrTypeInstr(Pointer, CI);
return;
}
@@ -929,8 +957,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) {
Value *ArgOperand = CI->getArgOperand(OpIdx);
- if (!isa<PointerType>(ArgOperand->getType()) &&
- !isa<TypedPointerType>(ArgOperand->getType()))
+ if (!isPointerTy(ArgOperand->getType()))
continue;
// Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs()
@@ -952,8 +979,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
continue;
if (ExpectedType->isTargetExtTy())
- insertAssignTypeInstrForTargetExtTypes(cast<TargetExtType>(ExpectedType),
- ArgOperand, B);
+ insertAssignPtrTypeTargetExt(cast<TargetExtType>(ExpectedType),
+ ArgOperand, B);
else
replacePointerOperandWithPtrCast(CI, ArgOperand, ExpectedType, OpIdx, B);
}
@@ -1145,7 +1172,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()},
EltTyConst, I, {B.getInt32(AddressSpace)}, B);
GR->addDeducedElementType(CI, ElemTy);
- AssignPtrTypeInstr[I] = CI;
+ GR->addAssignPtrTypeInstr(I, CI);
}
void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
@@ -1164,20 +1191,32 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
TypeToAssign = It->second;
}
}
- Constant *Const = UndefValue::get(TypeToAssign);
- buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I, {}, B);
+ buildAssignType(B, TypeToAssign, I);
}
for (const auto &Op : I->operands()) {
if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) ||
// Check GetElementPtrConstantExpr case.
(isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) {
setInsertPointSkippingPhis(B, I);
- if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
- buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
- UndefValue::get(B.getInt32Ty()), {}, B);
- else if (!isa<Instruction>(Op))
- buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {},
- B);
+ Type *OpTy = Op->getType();
+ if (isa<UndefValue>(Op) && OpTy->isAggregateType()) {
+ CallInst *AssignCI =
+ buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
+ UndefValue::get(B.getInt32Ty()), {}, B);
+ GR->addAssignPtrTypeInstr(Op, AssignCI);
+ } else if (!isa<Instruction>(Op)) {
+ Type *OpTy = Op->getType();
+ if (auto PType = dyn_cast<TypedPointerType>(OpTy)) {
+ buildAssignPtr(B, PType->getElementType(), Op);
+ } else if (isPointerTy(OpTy)) {
+ Type *ElemTy = GR->findDeducedElementType(Op);
+ buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op), Op);
+ } else {
+ CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type,
+ {OpTy}, Op, Op, {}, B);
+ GR->addAssignPtrTypeInstr(Op, AssignCI);
+ }
+ }
}
}
}
@@ -1368,14 +1407,12 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
continue;
insertAssignPtrTypeIntrs(I, B);
+ deduceOperandElementType(I);
insertAssignTypeIntrs(I, B);
insertPtrCastOrAssignTypeInstr(I, B);
insertSpirvDecorations(I, B);
}
- for (auto &I : instructions(Func))
- deduceOperandElementType(&I);
-
for (auto *I : Worklist) {
TrackConstants = true;
if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index ef0973d03d15..db01f68f48de 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -73,8 +73,11 @@ class SPIRVGlobalRegistry {
// untyped pointers.
DenseMap<Value *, Type *> DeducedElTys;
// Maps composite values to deduced types where untyped pointers are replaced
- // with typed ones
+ // with typed ones.
DenseMap<Value *, Type *> DeducedNestedTys;
+ // Maps values to their "assign type" calls: a registry of the created
+ // Intrinsic::spv_assign_type and Intrinsic::spv_assign_ptr_type instructions.
+ DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
// Add a new OpTypeXXX instruction without checking for duplicates.
SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
@@ -149,6 +152,17 @@ public:
return It == FunResPointerTypes.end() ? nullptr : It->second;
}
+ // A registry of "assign type" records:
+ // - Add a record.
+ void addAssignPtrTypeInstr(Value *Val, CallInst *AssignPtrTyCI) {
+ AssignPtrTypeInstr[Val] = AssignPtrTyCI;
+ }
+ // - Find a record.
+ CallInst *findAssignPtrTypeInstr(const Value *Val) {
+ auto It = AssignPtrTypeInstr.find(Val);
+ return It == AssignPtrTypeInstr.end() ? nullptr : It->second;
+ }
+
// Deduced element types of untyped pointers and composites:
// - Add a record to the map of deduced element types.
void addDeducedElementType(Value *Val, Type *Ty) { DeducedElTys[Val] = Ty; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 3d536085b78a..a0a253c23b1e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -417,7 +417,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineInstr *Def = MRI.getVRegDef(Reg);
assert(Def && "Expecting an instruction that defines the register");
// G_GLOBAL_VALUE already has type info.
- if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
+ if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE &&
+ Def->getOpcode() != SPIRV::ASSIGN_TYPE)
insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB,
MF.getRegInfo());
ToErase.push_back(&MI);
@@ -427,7 +428,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineInstr *Def = MRI.getVRegDef(Reg);
assert(Def && "Expecting an instruction that defines the register");
// G_GLOBAL_VALUE already has type info.
- if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
+ if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE &&
+ Def->getOpcode() != SPIRV::ASSIGN_TYPE)
insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo());
ToErase.push_back(&MI);
} else if (MIOp == TargetOpcode::G_CONSTANT ||
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 8e2063121e00..f5bc584ac4e1 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -178,14 +178,15 @@ static wasm::WasmLimits DefaultLimits() {
}
static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx,
- const StringRef &Name) {
+ const StringRef &Name,
+ bool is64) {
MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
if (Sym) {
if (!Sym->isFunctionTable())
Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
} else {
Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
- Sym->setFunctionTable();
+ Sym->setFunctionTable(is64);
// The default function table is synthesized by the linker.
Sym->setUndefined();
}
@@ -258,7 +259,7 @@ public:
MCAsmParserExtension::Initialize(Parser);
DefaultFunctionTable = GetOrCreateFunctionTableSymbol(
- getContext(), "__indirect_function_table");
+ getContext(), "__indirect_function_table", is64);
if (!STI->checkFeatures("+reference-types"))
DefaultFunctionTable->setOmitFromLinkingSection();
}
@@ -508,7 +509,7 @@ public:
auto &Tok = Lexer.getTok();
if (Tok.is(AsmToken::Identifier)) {
auto *Sym =
- GetOrCreateFunctionTableSymbol(getContext(), Tok.getString());
+ GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64);
const auto *Val = MCSymbolRefExpr::create(Sym, getContext());
*Op = std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(),
@@ -836,6 +837,9 @@ public:
// symbol
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
+ if (is64) {
+ Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64;
+ }
wasm::WasmTableType Type = {*ElemType, Limits};
WasmSym->setTableType(Type);
TOut.emitTableType(WasmSym);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 5e7279808cce..c5a047ee47d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -108,8 +108,9 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
if (!Sym->isFunctionTable())
Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
} else {
+ bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit();
Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
- Sym->setFunctionTable();
+ Sym->setFunctionTable(is64);
// The default function table is synthesized by the linker.
Sym->setUndefined();
}
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0bf3294af92a..3933e82b718f 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
? X86::PTILELOADDV
: X86::PTILELOADDT1V;
@@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
case Intrinsic::x86_tilestored64_internal: {
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
unsigned Opc = X86::PTILESTOREDV;
// _tile_stored_internal(row, col, buf, STRIDE, c)
SDValue Base = Node->getOperand(4);
@@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_tilestored64: {
if (!Subtarget->hasAMXTILE())
break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7d30de15f84d..3fbab3af32bb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -615,6 +615,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSIN, VT, Action);
setOperationAction(ISD::FCOS, VT, Action);
setOperationAction(ISD::FSINCOS, VT, Action);
+ setOperationAction(ISD::FTAN, VT, Action);
setOperationAction(ISD::FSQRT, VT, Action);
setOperationAction(ISD::FPOW, VT, Action);
setOperationAction(ISD::FLOG, VT, Action);
@@ -833,9 +834,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Always expand sin/cos functions even though x87 has an instruction.
+ // clang-format off
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+ setOperationAction(ISD::FTAN , MVT::f80, Expand);
+ // clang-format on
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
@@ -888,11 +892,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+ // clang-format off
setOperationAction(ISD::FSIN, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
setOperationAction(ISD::FCOS, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::FTAN, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall);
+ // clang-format on
// No STRICT_FSINCOS
setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
@@ -944,9 +952,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ // clang-format off
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FTAN, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
@@ -956,6 +966,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FEXP10, VT, Expand);
+ // clang-format on
}
// First set operation action for all vector types to either promote
@@ -2473,7 +2484,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
- for (ISD::NodeType Op :
+ // clang-format off
+ for (ISD::NodeType Op :
{ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
ISD::FEXP, ISD::STRICT_FEXP,
@@ -2482,9 +2494,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FLOG, ISD::STRICT_FLOG,
ISD::FLOG10, ISD::STRICT_FLOG10,
ISD::FPOW, ISD::STRICT_FPOW,
- ISD::FSIN, ISD::STRICT_FSIN})
+ ISD::FSIN, ISD::STRICT_FSIN,
+ ISD::FTAN, ISD::STRICT_FTAN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
+ // clang-format on
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
@@ -26776,7 +26790,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case Intrinsic::swift_async_context_addr: {
SDLoc dl(Op);
auto &MF = DAG.getMachineFunction();
- auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
MF.getFrameInfo().setFrameAddressIsTaken(true);
X86FI->setHasSwiftAsyncContext(true);
@@ -36781,7 +36795,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case TargetOpcode::PREALLOCATED_SETUP: {
assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
- auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setHasPreallocatedCall(true);
int64_t PreallocatedId = MI.getOperand(0).getImm();
size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
@@ -36798,7 +36812,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
int64_t PreallocatedId = MI.getOperand(1).getImm();
int64_t ArgIdx = MI.getOperand(2).getImm();
- auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
<< ", arg offset " << ArgOffset << "\n");
@@ -36841,6 +36855,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
unsigned Imm = MI.getOperand(0).getImm();
BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
MI.eraseFromParent(); // The pseudo is gone now.
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+ return BB;
+ }
+ case X86::PTILEZEROV: {
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
return BB;
}
case X86::PTILELOADD:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index c47bee070e04..99deacc811a1 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in {
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1,
- canFoldAsLoad = 1 in
+ canFoldAsLoad = 1, usesCustomInserter = 1 in
def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
[(set TILE:$dst, (int_x86_tilezero_internal
GR16:$src1, GR16:$src2))]>;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index b69058787a4e..079ac983a8a0 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -92,6 +92,14 @@ static bool isAMXIntrinsic(Value *I) {
return false;
}
+static bool containsAMXCode(Function &F) {
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ if (I.getType()->isX86_AMXTy())
+ return true;
+ return false;
+}
+
static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
Type *Ty) {
Function &F = *BB->getParent();
@@ -1230,6 +1238,14 @@ public:
}
bool runOnFunction(Function &F) override {
+ // Performance optimization: most code doesn't use AMX, so return early if
+ // there are no instructions that produce AMX values. This is sufficient, as
+ // AMX arguments and constants are not allowed -- so any producer of an AMX
+ // value must be an instruction.
+ // TODO: find a cheaper way for this, without looking at all instructions.
+ if (!containsAMXCode(F))
+ return false;
+
bool C = false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
TargetLibraryInfo *TLI =
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index f27676a27e86..613722b398f4 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -19,6 +19,7 @@
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
}
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
@@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- // There won't be a tile copy if neither tile register live in nor live out.
- bool HasTileCopy = false;
- for (const auto &LI : MBB.liveins()) {
- if (TILERegs.test(LI.PhysReg)) {
- HasTileCopy = true;
- break;
- }
- }
LiveRegUnits UsedRegs(*TRI);
UsedRegs.addLiveOuts(MBB);
- if (!HasTileCopy) {
- for (auto RegT : TILERegs.set_bits()) {
- if (UsedRegs.available(RegT)) {
- HasTileCopy = true;
- break;
- }
- }
- }
- if (!HasTileCopy)
- continue;
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
UsedRegs.stepBackward(MI);
if (!MI.isCopy())
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index f6e853270e07..8aaa49945f9d 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -21,6 +21,8 @@
namespace llvm {
+enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
+
/// X86MachineFunctionInfo - This class is derived from MachineFunction and
/// contains private X86 target-specific information for each MachineFunction.
class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// used to address arguments in a function using a base pointer.
int SEHFramePtrSaveIndex = 0;
+ /// The AMX programming model used in the function.
+ AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None;
+
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies.
bool IsSplitCSR = false;
@@ -219,6 +224,13 @@ public:
int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+ AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; }
+ void setAMXProgModel(AMXProgModelEnum Model) {
+ assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) &&
+ "mixed model is not supported");
+ AMXProgModel = Model;
+ }
+
SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
return ForwardedMustTailRegParms;
}
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 2d296771b1c0..186d4d84c251 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -620,11 +620,11 @@ def : WriteRes<WriteNop, []>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort015], 6, [2,1], 3, 6>;
-defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort015], 6, [2,1], 3, 7>;
+defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort01], 6, [2,1], 3, 6>;
+defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort01], 6, [2,1], 3, 7>;
defm : ICXWriteResPair<WritePHAdd, [ICXPort5,ICXPort05], 3, [2,1], 3, 5>;
-defm : ICXWriteResPair<WritePHAddX, [ICXPort5,ICXPort015], 3, [2,1], 3, 6>;
-defm : ICXWriteResPair<WritePHAddY, [ICXPort5,ICXPort015], 3, [2,1], 3, 7>;
+defm : ICXWriteResPair<WritePHAddX, [ICXPort15,ICXPort015], 3, [2,1], 3, 6>;
+defm : ICXWriteResPair<WritePHAddY, [ICXPort15,ICXPort015], 3, [2,1], 3, 7>;
// Remaining instrs.
@@ -886,7 +886,7 @@ def ICXWriteResGroup37 : SchedWriteRes<[ICXPort0,ICXPort5]> {
}
def: InstRW<[ICXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;
-def ICXWriteResGroup38 : SchedWriteRes<[ICXPort5,ICXPort01]> {
+def ICXWriteResGroup38 : SchedWriteRes<[ICXPort15,ICXPort01]> {
let Latency = 3;
let NumMicroOps = 3;
let ReleaseAtCycles = [2,1];
@@ -1739,13 +1739,13 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort01]> {
def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm",
"(V?)CVTPS2PDrm")>;
-def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+def ICXWriteResGroup143 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> {
let Latency = 9;
let NumMicroOps = 4;
let ReleaseAtCycles = [2,1,1];
}
-def: InstRW<[ICXWriteResGroup143], (instregex "(V?)PHADDSWrm",
- "(V?)PHSUBSWrm")>;
+def: InstRW<[ICXWriteResGroup143], (instrs PHADDSWrm, VPHADDSWrm,
+ PHSUBSWrm, VPHSUBSWrm)>;
def ICXWriteResGroup146 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> {
let Latency = 9;
@@ -1842,7 +1842,7 @@ def: InstRW<[ICXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
"VPEXPANDDZ128rm(b?)",
"VPEXPANDQZ128rm(b?)")>;
-def ICXWriteResGroup154 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+def ICXWriteResGroup154 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> {
let Latency = 10;
let NumMicroOps = 4;
let ReleaseAtCycles = [2,1,1];
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index a7dff0ecbcd9..4fded44085e8 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -615,8 +615,8 @@ def : WriteRes<WriteNop, []>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>;
-defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>;
+defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort01], 6, [2,1], 3, 6>;
+defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort01], 6, [2,1], 3, 7>;
defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>;
defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>;
defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 68155acd9e5b..b3b8486c604b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -302,6 +302,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0x805", "cortex-a76") // Kryo 4xx/5xx Silver
.Case("0xc00", "falkor")
.Case("0xc01", "saphira")
+ .Case("0x001", "oryon-1")
.Default("generic");
if (Implementer == "0x53") { // Samsung Electronics Co., Ltd.
// The Exynos chips have a convoluted ID scheme that doesn't seem to follow
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7464237d26d4..60a784ef002f 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -124,6 +124,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx1103"}, {"gfx1103"}, GK_GFX1103, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1150"}, {"gfx1150"}, GK_GFX1150, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1151"}, {"gfx1151"}, GK_GFX1151, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
+ {{"gfx1152"}, {"gfx1152"}, GK_GFX1152, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
@@ -275,6 +276,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
case GK_GFX1103: return {11, 0, 3};
case GK_GFX1150: return {11, 5, 0};
case GK_GFX1151: return {11, 5, 1};
+ case GK_GFX1152: return {11, 5, 2};
case GK_GFX1200: return {12, 0, 0};
case GK_GFX1201: return {12, 0, 1};
@@ -341,6 +343,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["image-insts"] = true;
Features["fp8-conversion-insts"] = true;
break;
+ case GK_GFX1152:
case GK_GFX1151:
case GK_GFX1150:
case GK_GFX1103:
@@ -542,6 +545,7 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) {
switch (parseArchAMDGCN(GPU)) {
case GK_GFX1201:
case GK_GFX1200:
+ case GK_GFX1152:
case GK_GFX1151:
case GK_GFX1150:
case GK_GFX1103:
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 9a5732dca5b7..549d03645f93 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -419,7 +419,8 @@ struct AAReturnedFromReturnedValues : public BaseType {
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
StateType S(StateType::getBestState(this->getState()));
- clampReturnedValueStates<AAType, StateType, IRAttributeKind, RecurseForSelectAndPHI>(
+ clampReturnedValueStates<AAType, StateType, IRAttributeKind,
+ RecurseForSelectAndPHI>(
A, *this, S,
PropagateCallBaseContext ? this->getCallBaseContext() : nullptr);
// TODO: If we know we visited all returned values, thus no are assumed
@@ -6973,10 +6974,9 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
if (AI.LibraryFunctionId != LibFunc___kmpc_alloc_shared) {
Instruction *CtxI = isa<InvokeInst>(AI.CB) ? AI.CB : AI.CB->getNextNode();
if (!Explorer || !Explorer->findInContextOf(UniqueFree, CtxI)) {
- LLVM_DEBUG(
- dbgs()
- << "[H2S] unique free call might not be executed with the allocation "
- << *UniqueFree << "\n");
+ LLVM_DEBUG(dbgs() << "[H2S] unique free call might not be executed "
+ "with the allocation "
+ << *UniqueFree << "\n");
return false;
}
}
@@ -10406,11 +10406,12 @@ struct AANoFPClassFloating : public AANoFPClassImpl {
struct AANoFPClassReturned final
: AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl,
- AANoFPClassImpl::StateType, false, Attribute::None, false> {
+ AANoFPClassImpl::StateType, false,
+ Attribute::None, false> {
AANoFPClassReturned(const IRPosition &IRP, Attributor &A)
: AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl,
- AANoFPClassImpl::StateType, false, Attribute::None, false>(
- IRP, A) {}
+ AANoFPClassImpl::StateType, false,
+ Attribute::None, false>(IRP, A) {}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 5fbdbc3a014f..92a9697720ef 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo
DeadArgumentElimination.cpp
ElimAvailExtern.cpp
EmbedBitcodePass.cpp
+ ExpandVariadics.cpp
ExtractGV.cpp
ForceFunctionAttrs.cpp
FunctionAttrs.cpp
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
new file mode 100644
index 000000000000..d340bc041ccd
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -0,0 +1,1012 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaces va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a variadic function are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+ DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+ cl::init(ExpandVariadicsMode::Unspecified),
+ cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+ "Use the implementation defaults"),
+ clEnumValN(ExpandVariadicsMode::Disable, "disable",
+ "Disable the pass entirely"),
+ clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+ "Optimise without changing ABI"),
+ clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+ "Change variadic calling convention")));
+
+bool commandLineOverride() {
+ return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependent behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+// This class may end up instantiated in TargetMachine instances, keeping it
+// here for now until enough targets are implemented for the API to evolve.
+class VariadicABIInfo {
+protected:
+ VariadicABIInfo() = default;
+
+public:
+ static std::unique_ptr<VariadicABIInfo> create(const Triple &T);
+
+ // Allow overriding whether the pass runs on a per-target basis
+ virtual bool enableForTarget() = 0;
+
+ // Whether a valist instance is passed by value or by address
+ // I.e. does it need to be alloca'ed and stored into, or can
+ // it be passed directly in a SSA register
+ virtual bool vaListPassedInSSARegister() = 0;
+
+ // The type of a va_list iterator object
+ virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+ // The type of a va_list as a function argument as lowered by C
+ virtual Type *vaListParameterType(Module &M) = 0;
+
+ // Initialize an allocated va_list object to point to an already
+ // initialized contiguous memory region.
+ // Return the value to pass as the va_list argument
+ virtual Value *initializeVaList(Module &M, LLVMContext &Ctx,
+ IRBuilder<> &Builder, AllocaInst *VaList,
+ Value *Buffer) = 0;
+
+ struct VAArgSlotInfo {
+ Align DataAlign; // With respect to the call frame
+ bool Indirect; // Passed via a pointer
+ };
+ virtual VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) = 0;
+
+ // Targets implemented so far all have the same trivial lowering for these
+ bool vaEndIsNop() { return true; }
+ bool vaCopyIsMemcpy() { return true; }
+
+ virtual ~VariadicABIInfo() = default;
+};
+
+// Module implements getFunction() which returns nullptr on missing declaration
+// and getOrInsertFunction which creates one when absent. Intrinsics.h only
+// implements getDeclaration which creates one when missing. Checking whether
+// an intrinsic exists thus inserts it in the module and it then needs to be
+// deleted again to clean up.
+// The right name for the two functions on intrinsics would match Module::,
+// but doing that in a single change would introduce nullptr dereferences
+// where currently there are none. The minimal collateral damage approach
+// would split the change over a release to help downstream branches. As it
+// is unclear what approach will be preferred, implementing the trivial
+// function here in the meantime to decouple from that discussion.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id,
+ ArrayRef<Type *> Tys = {}) {
+ auto *FT = Intrinsic::getType(M->getContext(), Id, Tys);
+ return M->getFunction(Tys.empty() ? Intrinsic::getName(Id)
+ : Intrinsic::getName(Id, Tys, M, FT));
+}
+
+class ExpandVariadics : public ModulePass {
+
+ // The pass construction sets the default to optimize when called from middle
+ // end and lowering when called from the backend. The command line variable
+ // overrides that. This is useful for testing and debugging. It also allows
+ // building an application with variadic functions wholly removed if one
+ // has sufficient control over the dependencies, e.g. a statically linked
+ // clang that has no variadic function calls remaining in the binary.
+
+public:
+ static char ID;
+ const ExpandVariadicsMode Mode;
+ std::unique_ptr<VariadicABIInfo> ABI;
+
+ ExpandVariadics(ExpandVariadicsMode Mode)
+ : ModulePass(ID),
+ Mode(commandLineOverride() ? ExpandVariadicsModeOption : Mode) {}
+
+ StringRef getPassName() const override { return "Expand variadic functions"; }
+
+ bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; }
+
+ bool runOnModule(Module &M) override;
+
+ bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+ Function *replaceAllUsesWithNewDeclaration(Module &M,
+ Function *OriginalFunction);
+
+ Function *deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
+ Function *OriginalFunction);
+
+ Function *defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
+ Function *VariadicWrapper,
+ Function *FixedArityReplacement);
+
+ bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+ Function *NF);
+
+ // The intrinsic functions va_copy and va_end are removed unconditionally.
+ // They correspond to a memcpy and a no-op on all implemented targets.
+ // The va_start intrinsic is removed from basic blocks that were not created
+ // by this pass, some may remain if needed to maintain the external ABI.
+
+ template <Intrinsic::ID ID, typename InstructionType>
+ bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+ PointerType *IntrinsicArgType) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ if (Function *Intrinsic =
+ getPreexistingDeclaration(&M, ID, {IntrinsicArgType})) {
+ for (User *U : make_early_inc_range(Intrinsic->users()))
+ if (auto *I = dyn_cast<InstructionType>(U))
+ Changed |= expandVAIntrinsicCall(Builder, DL, I);
+
+ if (Intrinsic->use_empty())
+ Intrinsic->eraseFromParent();
+ }
+ return Changed;
+ }
+
+ bool expandVAIntrinsicUsersWithAddrspace(Module &M, IRBuilder<> &Builder,
+ unsigned Addrspace) {
+ auto &Ctx = M.getContext();
+ PointerType *IntrinsicArgType = PointerType::get(Ctx, Addrspace);
+ bool Changed = false;
+
+ // expand vastart before vacopy as vastart may introduce a vacopy
+ Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(
+ M, Builder, IntrinsicArgType);
+ Changed |= expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(
+ M, Builder, IntrinsicArgType);
+ Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(
+ M, Builder, IntrinsicArgType);
+ return Changed;
+ }
+
+ bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+ VAStartInst *Inst);
+
+ bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+ VAEndInst *Inst);
+
+ bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+ VACopyInst *Inst);
+
+ FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+ // The type of "FTy" with the ... removed and a va_list appended
+ SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+ ArgTypes.push_back(ABI->vaListParameterType(M));
+ return FunctionType::get(FTy->getReturnType(), ArgTypes,
+ /*IsVarArgs=*/false);
+ }
+
+ static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+ AllocaInst *Alloced) {
+ std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL);
+ uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0;
+ return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+ }
+
+ bool expansionApplicableToFunction(Module &M, Function *F) {
+ if (F->isIntrinsic() || !F->isVarArg() ||
+ F->hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (F->getCallingConv() != CallingConv::C)
+ return false;
+
+ if (rewriteABI())
+ return true;
+
+ if (!F->hasExactDefinition())
+ return false;
+
+ return true;
+ }
+
+ bool expansionApplicableToFunctionCall(CallBase *CB) {
+ if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+ if (CI->isMustTailCall()) {
+ // Cannot expand musttail calls
+ return false;
+ }
+
+ if (CI->getCallingConv() != CallingConv::C)
+ return false;
+
+ return true;
+ }
+
+ if (isa<InvokeInst>(CB)) {
+ // Invoke not implemented in initial implementation of pass
+ return false;
+ }
+
+ // Other unimplemented derivative of CallBase
+ return false;
+ }
+
+ class ExpandedCallFrame {
+ // Helper for constructing an alloca instance containing the arguments bound
+ // to the variadic ... parameter, rearranged to allow indexing through a
+ // va_list iterator
+ enum { N = 4 };
+ SmallVector<Type *, N> FieldTypes;
+ enum Tag { Store, Memcpy, Padding };
+ SmallVector<std::tuple<Value *, uint64_t, Tag>, N> Source;
+
+ template <Tag tag> void append(Type *FieldType, Value *V, uint64_t Bytes) {
+ FieldTypes.push_back(FieldType);
+ Source.push_back({V, Bytes, tag});
+ }
+
+ public:
+ void store(LLVMContext &Ctx, Type *T, Value *V) { append<Store>(T, V, 0); }
+
+ void memcpy(LLVMContext &Ctx, Type *T, Value *V, uint64_t Bytes) {
+ append<Memcpy>(T, V, Bytes);
+ }
+
+ void padding(LLVMContext &Ctx, uint64_t By) {
+ append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr, 0);
+ }
+
+ size_t size() const { return FieldTypes.size(); }
+ bool empty() const { return FieldTypes.empty(); }
+
+ StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
+ const bool IsPacked = true;
+ return StructType::create(Ctx, FieldTypes,
+ (Twine(Name) + ".vararg").str(), IsPacked);
+ }
+
+ void initializeStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
+ AllocaInst *Alloced) {
+
+ StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());
+
+ for (size_t I = 0; I < size(); I++) {
+
+ auto [V, bytes, tag] = Source[I];
+
+ if (tag == Padding) {
+ assert(V == nullptr);
+ continue;
+ }
+
+ auto Dst = Builder.CreateStructGEP(VarargsTy, Alloced, I);
+
+ assert(V != nullptr);
+
+ if (tag == Store)
+ Builder.CreateStore(V, Dst);
+
+ if (tag == Memcpy)
+ Builder.CreateMemCpy(Dst, {}, V, {}, bytes);
+ }
+ }
+ };
+};
+
+bool ExpandVariadics::runOnModule(Module &M) {
+ bool Changed = false;
+ if (Mode == ExpandVariadicsMode::Disable)
+ return Changed;
+
+ Triple TT(M.getTargetTriple());
+ ABI = VariadicABIInfo::create(TT);
+ if (!ABI)
+ return Changed;
+
+ if (!ABI->enableForTarget())
+ return Changed;
+
+ auto &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ IRBuilder<> Builder(Ctx);
+
+ // Lowering needs to run on all functions exactly once.
+ // Optimize could run on functions containing va_start exactly once.
+ for (Function &F : make_early_inc_range(M))
+ Changed |= runOnFunction(M, Builder, &F);
+
+ // After runOnFunction, all known calls to known variadic functions have been
+ // replaced. va_start intrinsics are presently (and invalidly!) only present
+ // in functions that used to be variadic and have now been replaced to take a
+ // va_list instead. If lowering as opposed to optimising, calls to unknown
+ // variadic functions have also been replaced.
+
+ {
+ // 0 and AllocaAddrSpace are sufficient for the targets implemented so far
+ unsigned Addrspace = 0;
+ Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);
+
+ Addrspace = DL.getAllocaAddrSpace();
+ if (Addrspace != 0)
+ Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);
+ }
+
+ if (Mode != ExpandVariadicsMode::Lowering)
+ return Changed;
+
+ for (Function &F : make_early_inc_range(M)) {
+ if (F.isDeclaration())
+ continue;
+
+ // Now need to track down indirect calls. Can't find those
+ // by walking uses of variadic functions, need to crawl the instruction
+ // stream. Fortunately this is only necessary for the ABI rewrite case.
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : make_early_inc_range(BB)) {
+ if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->isIndirectCall()) {
+ FunctionType *FTy = CB->getFunctionType();
+ if (FTy->isVarArg())
+ Changed |= expandCall(M, Builder, CB, FTy, 0);
+ }
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
+ Function *OriginalFunction) {
+ bool Changed = false;
+
+ if (!expansionApplicableToFunction(M, OriginalFunction))
+ return Changed;
+
+ [[maybe_unused]] const bool OriginalFunctionIsDeclaration =
+ OriginalFunction->isDeclaration();
+ assert(rewriteABI() || !OriginalFunctionIsDeclaration);
+
+ // Declare a new function and redirect every use to that new function
+ Function *VariadicWrapper =
+ replaceAllUsesWithNewDeclaration(M, OriginalFunction);
+ assert(VariadicWrapper->isDeclaration());
+ assert(OriginalFunction->use_empty());
+
+ // Create a new function taking va_list containing the implementation of the
+ // original
+ Function *FixedArityReplacement =
+ deriveFixedArityReplacement(M, Builder, OriginalFunction);
+ assert(OriginalFunction->isDeclaration());
+ assert(FixedArityReplacement->isDeclaration() ==
+ OriginalFunctionIsDeclaration);
+ assert(VariadicWrapper->isDeclaration());
+
+ // Create a single block forwarding wrapper that turns a ... into a va_list
+ [[maybe_unused]] Function *VariadicWrapperDefine =
+ defineVariadicWrapper(M, Builder, VariadicWrapper, FixedArityReplacement);
+ assert(VariadicWrapperDefine == VariadicWrapper);
+ assert(!VariadicWrapper->isDeclaration());
+
+ // We now have:
+ // 1. the original function, now as a declaration with no uses
+ // 2. a variadic function that unconditionally calls a fixed arity replacement
+ // 3. a fixed arity function equivalent to the original function
+
+ // Replace known calls to the variadic with calls to the va_list equivalent
+ for (User *U : make_early_inc_range(VariadicWrapper->users())) {
+ if (CallBase *CB = dyn_cast<CallBase>(U)) {
+ Value *calledOperand = CB->getCalledOperand();
+ if (VariadicWrapper == calledOperand)
+ Changed |=
+ expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(),
+ FixedArityReplacement);
+ }
+ }
+
+ // The original function will be erased.
+ // One of the two new functions will become a replacement for the original.
+ // When preserving the ABI, the other is an internal implementation detail.
+ // When rewriting the ABI, RAUW then erase the variadic one.
+ Function *const ExternallyAccessible =
+ rewriteABI() ? FixedArityReplacement : VariadicWrapper;
+ Function *const InternalOnly =
+ rewriteABI() ? VariadicWrapper : FixedArityReplacement;
+
+ // The external function is the replacement for the original
+ ExternallyAccessible->setLinkage(OriginalFunction->getLinkage());
+ ExternallyAccessible->setVisibility(OriginalFunction->getVisibility());
+ ExternallyAccessible->setComdat(OriginalFunction->getComdat());
+ ExternallyAccessible->takeName(OriginalFunction);
+
+ // Annotate the internal one as internal
+ InternalOnly->setVisibility(GlobalValue::DefaultVisibility);
+ InternalOnly->setLinkage(GlobalValue::InternalLinkage);
+
+ // The original is unused and obsolete
+ OriginalFunction->eraseFromParent();
+
+ InternalOnly->removeDeadConstantUsers();
+
+ if (rewriteABI()) {
+ // All known calls to the function have been removed by expandCall
+ // Resolve everything else by replaceAllUsesWith
+ VariadicWrapper->replaceAllUsesWith(FixedArityReplacement);
+ VariadicWrapper->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+Function *
+ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M,
+ Function *OriginalFunction) {
+ auto &Ctx = M.getContext();
+ Function &F = *OriginalFunction;
+ FunctionType *FTy = F.getFunctionType();
+ Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace());
+
+ NF->setName(F.getName() + ".varargs");
+ NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+ F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+
+ AttrBuilder ParamAttrs(Ctx);
+ AttributeList Attrs = NF->getAttributes();
+ Attrs = Attrs.addParamAttributes(Ctx, FTy->getNumParams(), ParamAttrs);
+ NF->setAttributes(Attrs);
+
+ OriginalFunction->replaceAllUsesWith(NF);
+ return NF;
+}
+
+Function *
+ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
+ Function *OriginalFunction) {
+ Function &F = *OriginalFunction;
+ // The purpose here is to split the variadic function F into two functions.
+ // One is a variadic function that bundles the passed argument into a va_list
+ // and passes it to the second function. The second function does whatever
+ // the original F does, except that it takes a va_list instead of the ...
+
+ assert(expansionApplicableToFunction(M, &F));
+
+ auto &Ctx = M.getContext();
+
+ // Returned value isDeclaration() is equal to F.isDeclaration()
+ // but that property is not invariant throughout this function
+ const bool FunctionIsDefinition = !F.isDeclaration();
+
+ FunctionType *FTy = F.getFunctionType();
+ SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+ ArgTypes.push_back(ABI->vaListParameterType(M));
+
+ FunctionType *NFTy = inlinableVariadicFunctionType(M, FTy);
+ Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
+
+ // Note - same attribute handling as DeadArgumentElimination
+ NF->copyAttributesFrom(&F);
+ NF->setComdat(F.getComdat());
+ F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+ NF->setName(F.getName() + ".valist");
+ NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+ AttrBuilder ParamAttrs(Ctx);
+
+ AttributeList Attrs = NF->getAttributes();
+ Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
+ NF->setAttributes(Attrs);
+
+ // Splice the implementation into the new function with minimal changes
+ if (FunctionIsDefinition) {
+ NF->splice(NF->begin(), &F);
+
+ auto NewArg = NF->arg_begin();
+ for (Argument &Arg : F.args()) {
+ Arg.replaceAllUsesWith(NewArg);
+ NewArg->setName(Arg.getName()); // takeName without killing the old one
+ ++NewArg;
+ }
+ NewArg->setName("varargs");
+ }
+
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F.getAllMetadata(MDs);
+ for (auto [KindID, Node] : MDs)
+ NF->addMetadata(KindID, *Node);
+ F.clearMetadata();
+
+ return NF;
+}
+
+Function *
+ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
+ Function *VariadicWrapper,
+ Function *FixedArityReplacement) {
+ auto &Ctx = Builder.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ assert(VariadicWrapper->isDeclaration());
+ Function &F = *VariadicWrapper;
+
+ assert(F.isDeclaration());
+ Type *VaListTy = ABI->vaListType(Ctx);
+
+ auto *BB = BasicBlock::Create(Ctx, "entry", &F);
+ Builder.SetInsertPoint(BB);
+
+ AllocaInst *VaListInstance =
+ Builder.CreateAlloca(VaListTy, nullptr, "va_start");
+
+ Builder.CreateLifetimeStart(VaListInstance,
+ sizeOfAlloca(Ctx, DL, VaListInstance));
+
+ Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
+ {VaListInstance});
+
+ SmallVector<Value *> Args;
+ for (Argument &A : F.args())
+ Args.push_back(&A);
+
+ Type *ParameterType = ABI->vaListParameterType(M);
+ if (ABI->vaListPassedInSSARegister())
+ Args.push_back(Builder.CreateLoad(ParameterType, VaListInstance));
+ else
+ Args.push_back(Builder.CreateAddrSpaceCast(VaListInstance, ParameterType));
+
+ CallInst *Result = Builder.CreateCall(FixedArityReplacement, Args);
+
+ Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)},
+ {VaListInstance});
+ Builder.CreateLifetimeEnd(VaListInstance,
+ sizeOfAlloca(Ctx, DL, VaListInstance));
+
+ if (Result->getType()->isVoidTy())
+ Builder.CreateRetVoid();
+ else
+ Builder.CreateRet(Result);
+
+ return VariadicWrapper;
+}
+
+bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
+ FunctionType *VarargFunctionType,
+ Function *NF) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+
+ if (!expansionApplicableToFunctionCall(CB)) {
+ if (rewriteABI())
+ report_fatal_error("Cannot lower callbase instruction");
+ return Changed;
+ }
+
+ // This is tricky. The call instruction's function type might not match
+ // the type of the callee. When optimising, can leave it unchanged.
+ // WebAssembly detects that inconsistency and repairs it.
+ FunctionType *FuncType = CB->getFunctionType();
+ if (FuncType != VarargFunctionType) {
+ if (!rewriteABI())
+ return Changed;
+ FuncType = VarargFunctionType;
+ }
+
+ auto &Ctx = CB->getContext();
+
+ Align MaxFieldAlign(1);
+
+ // The strategy is to allocate a call frame containing the variadic
+ // arguments laid out such that a target specific va_list can be initialized
+ // with it, such that target specific va_arg instructions will correctly
+ // iterate over it. This means getting the alignment right and sometimes
+ // embedding a pointer to the value instead of embedding the value itself.
+
+ Function *CBF = CB->getParent()->getParent();
+
+ ExpandedCallFrame Frame;
+
+ uint64_t CurrentOffset = 0;
+
+ for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
+ Value *ArgVal = CB->getArgOperand(I);
+ const bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
+ const bool IsByRef = CB->paramHasAttr(I, Attribute::ByRef);
+
+ // The type of the value being passed, decoded from byval/byref attributes
+ // if required
+ Type *const UnderlyingType = IsByVal ? CB->getParamByValType(I)
+ : IsByRef ? CB->getParamByRefType(I)
+ : ArgVal->getType();
+ const uint64_t UnderlyingSize =
+ DL.getTypeAllocSize(UnderlyingType).getFixedValue();
+
+ // The type to be written into the call frame
+ Type *FrameFieldType = UnderlyingType;
+
+ // The value to copy from when initialising the frame alloca
+ Value *SourceValue = ArgVal;
+
+ VariadicABIInfo::VAArgSlotInfo SlotInfo = ABI->slotInfo(DL, UnderlyingType);
+
+ if (SlotInfo.Indirect) {
+ // The va_arg lowering loads through a pointer. Set up an alloca to aim
+ // that pointer at.
+ Builder.SetInsertPointPastAllocas(CBF);
+ Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+ Value *CallerCopy =
+ Builder.CreateAlloca(UnderlyingType, nullptr, "IndirectAlloca");
+
+ Builder.SetInsertPoint(CB);
+ if (IsByVal)
+ Builder.CreateMemCpy(CallerCopy, {}, ArgVal, {}, UnderlyingSize);
+ else
+ Builder.CreateStore(ArgVal, CallerCopy);
+
+ // Indirection now handled, pass the alloca ptr by value
+ FrameFieldType = DL.getAllocaPtrType(Ctx);
+ SourceValue = CallerCopy;
+ }
+
+ // Alignment of the value within the frame
+ // This probably needs to be controllable as a function of type
+ Align DataAlign = SlotInfo.DataAlign;
+
+ MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);
+
+ uint64_t DataAlignV = DataAlign.value();
+ if (uint64_t Rem = CurrentOffset % DataAlignV) {
+ // Inject explicit padding to deal with alignment requirements
+ uint64_t Padding = DataAlignV - Rem;
+ Frame.padding(Ctx, Padding);
+ CurrentOffset += Padding;
+ }
+
+ if (SlotInfo.Indirect) {
+ Frame.store(Ctx, FrameFieldType, SourceValue);
+ } else {
+ if (IsByVal)
+ Frame.memcpy(Ctx, FrameFieldType, SourceValue, UnderlyingSize);
+ else
+ Frame.store(Ctx, FrameFieldType, SourceValue);
+ }
+
+ CurrentOffset += DL.getTypeAllocSize(FrameFieldType).getFixedValue();
+ }
+
+ if (Frame.empty()) {
+ // Not passing any arguments; hopefully va_arg won't try to read any.
+ // Create a single-byte frame containing nothing to point the va_list
+ // instance at, as that is less special-casey in the compiler and probably
+ // easier to interpret in a debugger.
+ Frame.padding(Ctx, 1);
+ }
+
+ StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());
+
+ // The struct instance needs to be at least MaxFieldAlign for the alignment of
+ // the fields to be correct at runtime. Use the native stack alignment instead
+ // if that's greater as that tends to give better codegen.
+ // This is an awkward way to guess whether there is a known stack alignment
+ // without hitting an assert in DL.getStackAlignment, 1024 is an arbitrary
+ // number likely to be greater than the natural stack alignment.
+ // TODO: DL.getStackAlignment could return a MaybeAlign instead of assert
+ Align AllocaAlign = MaxFieldAlign;
+ if (DL.exceedsNaturalStackAlignment(Align(1024)))
+ AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment());
+
+ // Put the alloca to hold the variadic args in the entry basic block.
+ Builder.SetInsertPointPastAllocas(CBF);
+
+ // Set the current debug location explicitly as the SetInsertPoint method used above does not
+ Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+
+ // The awkward construction here is to set the alignment on the instance
+ AllocaInst *Alloced = Builder.Insert(
+ new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign),
+ "vararg_buffer");
+ Changed = true;
+ assert(Alloced->getAllocatedType() == VarargsTy);
+
+ // Initialize the fields in the struct
+ Builder.SetInsertPoint(CB);
+ Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+ Frame.initializeStructAlloca(DL, Builder, Alloced);
+
+ const unsigned NumArgs = FuncType->getNumParams();
+ SmallVector<Value *> Args(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+ // Initialize a va_list pointing to that struct and pass it as the last
+ // argument
+ AllocaInst *VaList = nullptr;
+ {
+ if (!ABI->vaListPassedInSSARegister()) {
+ Type *VaListTy = ABI->vaListType(Ctx);
+ Builder.SetInsertPointPastAllocas(CBF);
+ Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+ VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument");
+ Builder.SetInsertPoint(CB);
+ Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+ }
+ Builder.SetInsertPoint(CB);
+ Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced));
+ }
+
+ // Attributes excluding any on the vararg arguments
+ AttributeList PAL = CB->getAttributes();
+ if (!PAL.isEmpty()) {
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++)
+ ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
+ PAL =
+ AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCB = nullptr;
+
+ if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+ Value *Dst = NF ? NF : CI->getCalledOperand();
+ FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType);
+
+ NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI);
+
+ CallInst::TailCallKind TCK = CI->getTailCallKind();
+ assert(TCK != CallInst::TCK_MustTail);
+
+ // Can't tail call a function that is being passed a pointer to an alloca
+ if (TCK == CallInst::TCK_Tail)
+ TCK = CallInst::TCK_None;
+ CI->setTailCallKind(TCK);
+
+ } else {
+ llvm_unreachable("Unreachable when !expansionApplicableToFunctionCall()");
+ }
+
+ if (VaList)
+ Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+
+ Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+ NewCB->setAttributes(PAL);
+ NewCB->takeName(CB);
+ NewCB->setCallingConv(CB->getCallingConv());
+ NewCB->setDebugLoc(DebugLoc());
+
+ // DeadArgElim and ArgPromotion copy exactly this metadata
+ NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+ CB->replaceAllUsesWith(NewCB);
+ CB->eraseFromParent();
+ return Changed;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+ const DataLayout &DL,
+ VAStartInst *Inst) {
+ // Only removing va_start instructions that are not in variadic functions.
+ // Those would be rejected by the IR verifier before this pass.
+ // After splicing basic blocks from a variadic function into a fixed arity
+ // one, the va_start that used to refer to the ... parameter still exists.
+ // There are also variadic functions that this pass did not change and
+ // va_start instances in the created single block wrapper functions.
+ // Replace exactly the instances in non-variadic functions as those are
+ // the ones to be fixed up to use the va_list passed as the final argument.
+
+ Function *ContainingFunction = Inst->getFunction();
+ if (ContainingFunction->isVarArg()) {
+ return false;
+ }
+
+ // The last argument is a vaListParameterType, either a va_list
+ // or a pointer to one depending on the target.
+ bool PassedByValue = ABI->vaListPassedInSSARegister();
+ Argument *PassedVaList =
+ ContainingFunction->getArg(ContainingFunction->arg_size() - 1);
+
+ // va_start takes a pointer to a va_list, e.g. one on the stack
+ Value *VaStartArg = Inst->getArgList();
+
+ Builder.SetInsertPoint(Inst);
+
+ if (PassedByValue) {
+ // The general thing to do is create an alloca, store the va_list argument
+ // to it, then create a va_copy. When vaCopyIsMemcpy(), this optimises to a
+ // store to the VaStartArg.
+ assert(ABI->vaCopyIsMemcpy());
+ Builder.CreateStore(PassedVaList, VaStartArg);
+ } else {
+
+ // Otherwise emit a vacopy to pick up target-specific handling if any
+ auto &Ctx = Builder.getContext();
+
+ Builder.CreateIntrinsic(Intrinsic::vacopy, {DL.getAllocaPtrType(Ctx)},
+ {VaStartArg, PassedVaList});
+ }
+
+ Inst->eraseFromParent();
+ return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+ VAEndInst *Inst) {
+ assert(ABI->vaEndIsNop());
+ Inst->eraseFromParent();
+ return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+ const DataLayout &DL,
+ VACopyInst *Inst) {
+ assert(ABI->vaCopyIsMemcpy());
+ Builder.SetInsertPoint(Inst);
+
+ auto &Ctx = Builder.getContext();
+ Type *VaListTy = ABI->vaListType(Ctx);
+ uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+
+ Builder.CreateMemCpy(Inst->getDest(), {}, Inst->getSrc(), {},
+ Builder.getInt32(Size));
+
+ Inst->eraseFromParent();
+ return true;
+}
+
+struct Amdgpu final : public VariadicABIInfo {
+
+ bool enableForTarget() override { return true; }
+
+ bool vaListPassedInSSARegister() override { return true; }
+
+ Type *vaListType(LLVMContext &Ctx) override {
+ return PointerType::getUnqual(Ctx);
+ }
+
+ Type *vaListParameterType(Module &M) override {
+ return PointerType::getUnqual(M.getContext());
+ }
+
+ Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst * /*va_list*/, Value *Buffer) override {
+ // Given Buffer, which is the AllocaInst for vararg_buffer,
+ // need to return something usable as the parameter type.
+ return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M));
+ }
+
+ VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
+ return {Align(4), false};
+ }
+};
+
+struct Wasm final : public VariadicABIInfo {
+
+ bool enableForTarget() override {
+ // Currently wasm is only used for testing.
+ return commandLineOverride();
+ }
+
+ bool vaListPassedInSSARegister() override { return true; }
+
+ Type *vaListType(LLVMContext &Ctx) override {
+ return PointerType::getUnqual(Ctx);
+ }
+
+ Type *vaListParameterType(Module &M) override {
+ return PointerType::getUnqual(M.getContext());
+ }
+
+ Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst * /*va_list*/, Value *Buffer) override {
+ return Buffer;
+ }
+
+ VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
+ LLVMContext &Ctx = Parameter->getContext();
+ const unsigned MinAlign = 4;
+ Align A = DL.getABITypeAlign(Parameter);
+ if (A < MinAlign)
+ A = Align(MinAlign);
+
+ if (auto s = dyn_cast<StructType>(Parameter)) {
+ if (s->getNumElements() > 1) {
+ return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true};
+ }
+ }
+
+ return {A, false};
+ }
+};
+
+std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) {
+ switch (T.getArch()) {
+ case Triple::r600:
+ case Triple::amdgcn: {
+ return std::make_unique<Amdgpu>();
+ }
+
+ case Triple::wasm32: {
+ return std::make_unique<Wasm>();
+ }
+
+ default:
+ return {};
+ }
+}
+
+} // namespace
+
+char ExpandVariadics::ID = 0;
+
+INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false,
+ false)
+
+ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode M) {
+ return new ExpandVariadics(M);
+}
+
+PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
+ return ExpandVariadics(Mode).runOnModule(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
+
+ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode M) : Mode(M) {}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 03923b83cf34..f033d2b0d6d0 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -262,8 +262,70 @@ public:
// TODO: Should this be a map (from Caller node) for more efficient lookup?
std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
- // The set of IDs for contexts including this node.
- DenseSet<uint32_t> ContextIds;
+ // Get the list of edges from which we can compute allocation information
+ // such as the context ids and allocation type of this node.
+ const std::vector<std::shared_ptr<ContextEdge>> *
+ getEdgesWithAllocInfo() const {
+ // If node has any callees, compute from those, otherwise compute from
+ // callers (i.e. if this is the leaf allocation node).
+ if (!CalleeEdges.empty())
+ return &CalleeEdges;
+ if (!CallerEdges.empty()) {
+ // A node with caller edges but no callee edges must be the allocation
+ // node.
+ assert(IsAllocation);
+ return &CallerEdges;
+ }
+ return nullptr;
+ }
+
+ // Compute the context ids for this node from the union of its edge context
+ // ids.
+ DenseSet<uint32_t> getContextIds() const {
+ DenseSet<uint32_t> ContextIds;
+ auto *Edges = getEdgesWithAllocInfo();
+ if (!Edges)
+ return {};
+ unsigned Count = 0;
+ for (auto &Edge : *Edges)
+ Count += Edge->getContextIds().size();
+ ContextIds.reserve(Count);
+ for (auto &Edge : *Edges)
+ ContextIds.insert(Edge->getContextIds().begin(),
+ Edge->getContextIds().end());
+ return ContextIds;
+ }
+
+ // Compute the allocation type for this node from the OR of its edge
+ // allocation types.
+ uint8_t computeAllocType() const {
+ auto *Edges = getEdgesWithAllocInfo();
+ if (!Edges)
+ return (uint8_t)AllocationType::None;
+ uint8_t BothTypes =
+ (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+ uint8_t AllocType = (uint8_t)AllocationType::None;
+ for (auto &Edge : *Edges) {
+ AllocType |= Edge->AllocTypes;
+ // Bail early if alloc type reached both, no further refinement.
+ if (AllocType == BothTypes)
+ return AllocType;
+ }
+ return AllocType;
+ }
+
+ // The context ids set for this node is empty if its edge context ids are
+ // also all empty.
+ bool emptyContextIds() const {
+ auto *Edges = getEdgesWithAllocInfo();
+ if (!Edges)
+ return true;
+ for (auto &Edge : *Edges) {
+ if (!Edge->getContextIds().empty())
+ return false;
+ }
+ return true;
+ }
// List of clones of this ContextNode, initially empty.
std::vector<ContextNode *> Clones;
@@ -308,15 +370,11 @@ public:
void printCall(raw_ostream &OS) const { Call.print(OS); }
// True if this node was effectively removed from the graph, in which case
- // its context id set, caller edges, and callee edges should all be empty.
+ // it should have an allocation type of None and empty context ids.
bool isRemoved() const {
- // Note that we can have non-empty context ids with empty caller and
- // callee edges if the graph ends up with a single node.
- if (ContextIds.empty())
- assert(CalleeEdges.empty() && CallerEdges.empty() &&
- "Context ids empty but at least one of callee and caller edges "
- "were not!");
- return ContextIds.empty();
+ assert((AllocTypes == (uint8_t)AllocationType::None) ==
+ emptyContextIds());
+ return AllocTypes == (uint8_t)AllocationType::None;
}
void dump() const;
@@ -429,7 +487,8 @@ private:
/// else to its callers. Also updates OrigNode's edges to remove any context
/// ids moved to the newly created edge.
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
- bool TowardsCallee);
+ bool TowardsCallee,
+ DenseSet<uint32_t> RemainingContextIds);
/// Get the stack id corresponding to the given Id or Index (for IR this will
/// return itself, for a summary index this will return the id recorded in the
@@ -958,7 +1017,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
// Update alloc type and context ids for this MIB.
AllocNode->AllocTypes |= (uint8_t)AllocType;
- AllocNode->ContextIds.insert(LastContextId);
// Now add or update nodes for each stack id in alloc's context.
// Later when processing the stack ids on non-alloc callsites we will adjust
@@ -983,7 +1041,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
auto Ins = StackIdSet.insert(StackId);
if (!Ins.second)
StackNode->Recursive = true;
- StackNode->ContextIds.insert(LastContextId);
StackNode->AllocTypes |= (uint8_t)AllocType;
PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
PrevNode = StackNode;
@@ -1034,7 +1091,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// it resulted in any added ids to NextNode.
if (!NewIdsToAdd.empty()) {
Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
- NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
UpdateCallers(NextNode, Visited, UpdateCallers);
}
}
@@ -1043,21 +1099,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
DenseSet<const ContextEdge *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap) {
auto *Node = Entry.second;
- // Update ids on the allocation nodes before calling the recursive
- // update along caller edges, since this simplifies the logic during
- // that traversal.
- DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
- Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
UpdateCallers(Node, Visited, UpdateCallers);
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
- ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
- // Make a copy of the context ids, since this will be adjusted below as they
- // are moved.
- DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
+ ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
+ // This must be passed by value to make a copy since it will be adjusted
+ // as ids are moved.
+ DenseSet<uint32_t> RemainingContextIds) {
auto &OrigEdges =
TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
// Increment iterator in loop so that we can remove edges as needed.
@@ -1104,6 +1155,51 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
+static void checkEdge(
+ const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
+ // Confirm that alloc type is not None and that we have at least one context
+ // id.
+ assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
+ assert(!Edge->ContextIds.empty());
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
+ bool CheckEdges = true) {
+ if (Node->isRemoved())
+ return;
+#ifndef NDEBUG
+ // Compute node's context ids once for use in asserts.
+ auto NodeContextIds = Node->getContextIds();
+#endif
+ // Node's context ids should be the union of both its callee and caller edge
+ // context ids.
+ if (Node->CallerEdges.size()) {
+ DenseSet<uint32_t> CallerEdgeContextIds(
+ Node->CallerEdges.front()->ContextIds);
+ for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
+ if (CheckEdges)
+ checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
+ set_union(CallerEdgeContextIds, Edge->ContextIds);
+ }
+ // Node can have more context ids than callers if some contexts terminate at
+ // node and some are longer.
+ assert(NodeContextIds == CallerEdgeContextIds ||
+ set_is_subset(CallerEdgeContextIds, NodeContextIds));
+ }
+ if (Node->CalleeEdges.size()) {
+ DenseSet<uint32_t> CalleeEdgeContextIds(
+ Node->CalleeEdges.front()->ContextIds);
+ for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
+ if (CheckEdges)
+ checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
+ set_union(CalleeEdgeContextIds, Edge->getContextIds());
+ }
+ assert(NodeContextIds == CalleeEdgeContextIds);
+ }
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
assignStackNodesPostOrder(ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
@@ -1178,7 +1274,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// duplicated context ids. We have to recompute as we might have overlap
// overlap between the saved context ids for different last nodes, and
// removed them already during the post order traversal.
- set_intersect(SavedContextIds, FirstNode->ContextIds);
+ set_intersect(SavedContextIds, FirstNode->getContextIds());
ContextNode *PrevNode = nullptr;
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
@@ -1211,18 +1307,17 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
ContextNode *NewNode = NodeOwner.back().get();
NodeToCallingFunc[NewNode] = Func;
NonAllocationCallToContextNodeMap[Call] = NewNode;
- NewNode->ContextIds = SavedContextIds;
- NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
+ NewNode->AllocTypes = computeAllocType(SavedContextIds);
// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callee's to reflect those
// moved to NewNode.
- connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
+ connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for FirstNode's caller's to reflect those
// moved to NewNode.
- connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
+ connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
// Now we need to remove context ids from edges/nodes between First and
// Last Node.
@@ -1234,18 +1329,32 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// Remove the context ids moved to NewNode from CurNode, and the
// edge from the prior node.
- set_subtract(CurNode->ContextIds, NewNode->ContextIds);
if (PrevNode) {
auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
assert(PrevEdge);
- set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
+ set_subtract(PrevEdge->getContextIds(), SavedContextIds);
if (PrevEdge->getContextIds().empty()) {
PrevNode->eraseCallerEdge(PrevEdge);
CurNode->eraseCalleeEdge(PrevEdge);
}
}
+ // Since we update the edges from leaf to tail, only look at the callee
+ // edges. This isn't an alloc node, so if there are no callee edges, the
+ // alloc type is None.
+ CurNode->AllocTypes = CurNode->CalleeEdges.empty()
+ ? (uint8_t)AllocationType::None
+ : CurNode->computeAllocType();
PrevNode = CurNode;
}
+ if (VerifyNodes) {
+ checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
+ for (auto Id : Ids) {
+ ContextNode *CurNode = getNodeForStackId(Id);
+ // We should only have kept stack ids that had nodes.
+ assert(CurNode);
+ checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
+ }
+ }
}
}
@@ -1319,7 +1428,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// Initialize the context ids with the last node's. We will subsequently
// refine the context ids by computing the intersection along all edges.
- DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
+ DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
assert(!LastNodeContextIds.empty());
for (unsigned I = 0; I < Calls.size(); I++) {
@@ -1442,6 +1551,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap)
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
+ if (VerifyCCG)
+ check();
}
uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
@@ -1786,8 +1897,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
// First check if we have already synthesized a node for this tail call.
if (TailCallToContextNodeMap.count(NewCall)) {
NewNode = TailCallToContextNodeMap[NewCall];
- NewNode->ContextIds.insert(Edge->ContextIds.begin(),
- Edge->ContextIds.end());
NewNode->AllocTypes |= Edge->AllocTypes;
} else {
FuncToCallsWithMetadata[Func].push_back({NewCall});
@@ -1797,7 +1906,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
NewNode = NodeOwner.back().get();
NodeToCallingFunc[NewNode] = Func;
TailCallToContextNodeMap[NewCall] = NewNode;
- NewNode->ContextIds = Edge->ContextIds;
NewNode->AllocTypes = Edge->AllocTypes;
}
@@ -2091,6 +2199,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
OS << "\n";
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
OS << "\tContextIds:";
+ // Make a copy of the computed context ids that we can sort for stability.
+ auto ContextIds = getContextIds();
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
@@ -2151,53 +2261,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
-static void checkEdge(
- const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
- // Confirm that alloc type is not None and that we have at least one context
- // id.
- assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
- assert(!Edge->ContextIds.empty());
-}
-
-template <typename DerivedCCG, typename FuncTy, typename CallTy>
-static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
- bool CheckEdges = true) {
- if (Node->isRemoved())
- return;
- // Node's context ids should be the union of both its callee and caller edge
- // context ids.
- if (Node->CallerEdges.size()) {
- auto EI = Node->CallerEdges.begin();
- auto &FirstEdge = *EI;
- EI++;
- DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
- for (; EI != Node->CallerEdges.end(); EI++) {
- const auto &Edge = *EI;
- if (CheckEdges)
- checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
- set_union(CallerEdgeContextIds, Edge->ContextIds);
- }
- // Node can have more context ids than callers if some contexts terminate at
- // node and some are longer.
- assert(Node->ContextIds == CallerEdgeContextIds ||
- set_is_subset(CallerEdgeContextIds, Node->ContextIds));
- }
- if (Node->CalleeEdges.size()) {
- auto EI = Node->CalleeEdges.begin();
- auto &FirstEdge = *EI;
- EI++;
- DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
- for (; EI != Node->CalleeEdges.end(); EI++) {
- const auto &Edge = *EI;
- if (CheckEdges)
- checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
- set_union(CalleeEdgeContextIds, Edge->ContextIds);
- }
- assert(Node->ContextIds == CalleeEdgeContextIds);
- }
-}
-
-template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
for (const auto Node : nodes<GraphType>(this)) {
@@ -2284,7 +2347,7 @@ struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
static std::string getNodeAttributes(NodeRef Node, GraphType) {
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
- getContextIds(Node->ContextIds) + "\"")
+ getContextIds(Node->getContextIds()) + "\"")
.str();
AttributeString +=
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
@@ -2443,16 +2506,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
set_subtract(Edge->ContextIds, ContextIdsToMove);
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
}
- // Now perform some updates that are common to all cases: the NewCallee gets
- // the moved ids added, and we need to remove those ids from OldCallee and
- // update its alloc type (NewCallee alloc type updates handled above).
- NewCallee->ContextIds.insert(ContextIdsToMove.begin(),
- ContextIdsToMove.end());
- set_subtract(OldCallee->ContextIds, ContextIdsToMove);
- OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
- // OldCallee alloc type should be None iff its context id set is now empty.
- assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
- OldCallee->ContextIds.empty());
// Now walk the old callee node's callee edges and move Edge's context ids
// over to the corresponding edge into the clone (which is created here if
// this is a newly created clone).
@@ -2484,6 +2537,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
NewCallee->CalleeEdges.push_back(NewEdge);
NewEdge->Callee->CallerEdges.push_back(NewEdge);
}
+ // Recompute the node alloc type now that its callee edges have been
+ // updated (since we will compute from those edges).
+ OldCallee->AllocTypes = OldCallee->computeAllocType();
+ // OldCallee alloc type should be None iff its context id set is now empty.
+ assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
+ OldCallee->emptyContextIds());
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
@@ -2528,7 +2587,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap) {
Visited.clear();
- identifyClones(Entry.second, Visited, Entry.second->ContextIds);
+ identifyClones(Entry.second, Visited, Entry.second->getContextIds());
}
Visited.clear();
for (auto &Entry : AllocationCallToContextNodeMap)
@@ -2714,7 +2773,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
}
// We should still have some context ids on the original Node.
- assert(!Node->ContextIds.empty());
+ assert(!Node->emptyContextIds());
// Sanity check that no alloc types on node or edges are None.
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
@@ -2918,7 +2977,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// find additional cloning is required.
std::deque<ContextNode *> ClonesWorklist;
// Ignore original Node if we moved all of its contexts to clones.
- if (!Node->ContextIds.empty())
+ if (!Node->emptyContextIds())
ClonesWorklist.push_back(Node);
ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
Node->Clones.end());
@@ -3258,7 +3317,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// Skip if either no call to update, or if we ended up with no context ids
// (we moved all edges onto other clones).
- if (!Node->hasCall() || Node->ContextIds.empty())
+ if (!Node->hasCall() || Node->emptyContextIds())
return;
if (Node->IsAllocation) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 89193f8ff94b..38c1c2644554 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4745,6 +4745,29 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q,
Constant::getNullValue(Op1->getType()));
}
+ if (!ICmpInst::isSigned(Pred))
+ return nullptr;
+
+ KnownBits KnownY = IC.computeKnownBits(A, /*Depth=*/0, &I);
+ // (X & NegY) spred X --> (X & NegY) upred X
+ if (KnownY.isNegative())
+ return new ICmpInst(ICmpInst::getUnsignedPredicate(Pred), Op0, Op1);
+
+ if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGT)
+ return nullptr;
+
+ if (KnownY.isNonNegative())
+ // (X & PosY) s<= X --> X s>= 0
+ // (X & PosY) s> X --> X s< 0
+ return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
+ Constant::getNullValue(Op1->getType()));
+
+ if (isKnownNegative(Op1, IC.getSimplifyQuery().getWithInstruction(&I)))
+ // (NegX & Y) s<= NegX --> Y s< 0
+ // (NegX & Y) s> NegX --> Y s>= 0
+ return new ICmpInst(ICmpInst::getFlippedStrictnessPredicate(Pred), A,
+ Constant::getNullValue(A->getType()));
+
return nullptr;
}
@@ -4772,7 +4795,7 @@ static Instruction *foldICmpOrXX(ICmpInst &I, const SimplifyQuery &Q,
if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) {
// icmp (X | Y) eq/ne Y --> (X & ~Y) eq/ne 0 if Y is freely invertible
if (Value *NotOp1 =
- IC.getFreelyInverted(Op1, Op1->hasOneUse(), &IC.Builder))
+ IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder))
return new ICmpInst(Pred, IC.Builder.CreateAnd(A, NotOp1),
Constant::getNullValue(Op1->getType()));
// icmp (X | Y) eq/ne Y --> (~X | Y) eq/ne -1 if X is freely invertible.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 313beb7b6407..d2aaa5e23054 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1294,8 +1294,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
// X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite
// replacement cycle.
Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1);
- if (TrueVal != CmpLHS &&
- isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) {
+ if (TrueVal != CmpLHS && isGuaranteedNotToBeUndef(CmpRHS, SQ.AC, &Sel, &DT)) {
if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ,
/* AllowRefinement */ true))
// Require either the replacement or the simplification result to be a
@@ -1316,8 +1315,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
if (replaceInInstruction(TrueVal, CmpLHS, CmpRHS))
return &Sel;
}
- if (TrueVal != CmpRHS &&
- isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT))
+ if (TrueVal != CmpRHS && isGuaranteedNotToBeUndef(CmpLHS, SQ.AC, &Sel, &DT))
if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ,
/* AllowRefinement */ true))
if (isa<Constant>(CmpLHS) || isa<Constant>(V))
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 2aa21759d56e..a0e63bf12400 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -337,13 +337,17 @@ private:
unsigned AccessSizeIndex,
Instruction *InsertBefore, DomTreeUpdater &DTU,
LoopInfo *LI);
- bool ignoreMemIntrinsic(MemIntrinsic *MI);
+ bool ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, MemIntrinsic *MI);
void instrumentMemIntrinsic(MemIntrinsic *MI);
bool instrumentMemAccess(InterestingMemoryOperand &O, DomTreeUpdater &DTU,
LoopInfo *LI);
- bool ignoreAccess(Instruction *Inst, Value *Ptr);
+ bool ignoreAccessWithoutRemark(Instruction *Inst, Value *Ptr);
+ bool ignoreAccess(OptimizationRemarkEmitter &ORE, Instruction *Inst,
+ Value *Ptr);
+
void getInterestingMemoryOperands(
- Instruction *I, const TargetLibraryInfo &TLI,
+ OptimizationRemarkEmitter &ORE, Instruction *I,
+ const TargetLibraryInfo &TLI,
SmallVectorImpl<InterestingMemoryOperand> &Interesting);
void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
@@ -765,7 +769,8 @@ Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) {
return IRB.CreateLoad(PtrTy, GlobalDynamicAddress);
}
-bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
+bool HWAddressSanitizer::ignoreAccessWithoutRemark(Instruction *Inst,
+ Value *Ptr) {
// Do not instrument accesses from different address spaces; we cannot deal
// with them.
Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
@@ -795,8 +800,23 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
return false;
}
+bool HWAddressSanitizer::ignoreAccess(OptimizationRemarkEmitter &ORE,
+ Instruction *Inst, Value *Ptr) {
+ bool Ignored = ignoreAccessWithoutRemark(Inst, Ptr);
+ if (Ignored) {
+ ORE.emit(
+ [&]() { return OptimizationRemark(DEBUG_TYPE, "ignoreAccess", Inst); });
+ } else {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ignoreAccess", Inst);
+ });
+ }
+ return Ignored;
+}
+
void HWAddressSanitizer::getInterestingMemoryOperands(
- Instruction *I, const TargetLibraryInfo &TLI,
+ OptimizationRemarkEmitter &ORE, Instruction *I,
+ const TargetLibraryInfo &TLI,
SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
// Skip memory accesses inserted by another instrumentation.
if (I->hasMetadata(LLVMContext::MD_nosanitize))
@@ -807,22 +827,22 @@ void HWAddressSanitizer::getInterestingMemoryOperands(
return;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(I, LI->getPointerOperand()))
+ if (!ClInstrumentReads || ignoreAccess(ORE, I, LI->getPointerOperand()))
return;
Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
LI->getType(), LI->getAlign());
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(I, SI->getPointerOperand()))
+ if (!ClInstrumentWrites || ignoreAccess(ORE, I, SI->getPointerOperand()))
return;
Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
SI->getValueOperand()->getType(), SI->getAlign());
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(I, RMW->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(ORE, I, RMW->getPointerOperand()))
return;
Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
RMW->getValOperand()->getType(), std::nullopt);
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(I, XCHG->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(ORE, I, XCHG->getPointerOperand()))
return;
Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
XCHG->getCompareOperand()->getType(),
@@ -830,7 +850,7 @@ void HWAddressSanitizer::getInterestingMemoryOperands(
} else if (auto *CI = dyn_cast<CallInst>(I)) {
for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(I, CI->getArgOperand(ArgNo)))
+ ignoreAccess(ORE, I, CI->getArgOperand(ArgNo)))
continue;
Type *Ty = CI->getParamByValType(ArgNo);
Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
@@ -1035,13 +1055,14 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
->setSuccessor(0, TCI.TagMismatchTerm->getParent());
}
-bool HWAddressSanitizer::ignoreMemIntrinsic(MemIntrinsic *MI) {
+bool HWAddressSanitizer::ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE,
+ MemIntrinsic *MI) {
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- return (!ClInstrumentWrites || ignoreAccess(MTI, MTI->getDest())) &&
- (!ClInstrumentReads || ignoreAccess(MTI, MTI->getSource()));
+ return (!ClInstrumentWrites || ignoreAccess(ORE, MTI, MTI->getDest())) &&
+ (!ClInstrumentReads || ignoreAccess(ORE, MTI, MTI->getSource()));
}
if (isa<MemSetInst>(MI))
- return !ClInstrumentWrites || ignoreAccess(MI, MI->getDest());
+ return !ClInstrumentWrites || ignoreAccess(ORE, MI, MI->getDest());
return false;
}
@@ -1541,6 +1562,9 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
NumTotalFuncs++;
+ OptimizationRemarkEmitter &ORE =
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
if (selectiveInstrumentationShouldSkip(F, FAM))
return;
@@ -1562,10 +1586,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
LandingPadVec.push_back(&Inst);
- getInterestingMemoryOperands(&Inst, TLI, OperandsToInstrument);
+ getInterestingMemoryOperands(ORE, &Inst, TLI, OperandsToInstrument);
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
- if (!ignoreMemIntrinsic(MI))
+ if (!ignoreMemIntrinsic(ORE, MI))
IntrinToInstrument.push_back(MI);
}
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index ba2546b8db0e..4371b821eae6 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -827,7 +827,8 @@ private:
return false;
}
- if (Metrics.convergent) {
+ // FIXME: Allow jump threading with controlled convergence.
+ if (Metrics.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
<< "convergent instructions.\n");
ORE->emit([&]() {
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 7b4c54370e48..f8e2f1f28088 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns);
if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
- << " which cannot be duplicated or have invalid cost.\n");
+ LLVM_DEBUG(dbgs() << " Loop not considered unrollable\n");
return LoopUnrollResult::Unmodified;
}
@@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return LoopUnrollResult::Unmodified;
}
- if (InnerUCE.Convergent || OuterUCE.Convergent) {
+ // FIXME: The call to canUnroll() allows some controlled convergent
+ // operations, but we block them here for future changes.
+ if (InnerUCE.Convergence != ConvergenceKind::None ||
+ OuterUCE.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(
dbgs() << " Not unrolling loop with convergent instructions.\n");
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 10fc9e9303e8..cbc35b6dd429 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator(
const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
+ L);
NumInlineCandidates = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
- Convergent = Metrics.convergent;
+ Convergence = Metrics.Convergence;
LoopSize = Metrics.NumInsts;
+ ConvergenceAllowsRuntime =
+ Metrics.Convergence != ConvergenceKind::Uncontrolled &&
+ !getLoopConvergenceHeart(L);
// Don't allow an estimate of size zero. This would allows unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
@@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator(
LoopSize = BEInsns + 1;
}
+bool UnrollCostEstimator::canUnroll() const {
+ switch (Convergence) {
+ case ConvergenceKind::ExtendedLoop:
+ LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n");
+ return false;
+ default:
+ break;
+ }
+ if (!LoopSize.isValid()) {
+ LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n");
+ return false;
+ }
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n");
+ return false;
+ }
+ return true;
+}
+
uint64_t UnrollCostEstimator::getUnrolledLoopSize(
const TargetTransformInfo::UnrollingPreferences &UP,
unsigned CountOverwrite) const {
@@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
if (!UCE.canUnroll()) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
- << " which cannot be duplicated or have invalid cost.\n");
+ LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n");
return LoopUnrollResult::Unmodified;
}
@@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
// is unsafe -- it adds a control-flow dependency to the convergent
// operation. Therefore restrict remainder loop (try unrolling without).
//
- // TODO: This is quite conservative. In practice, convergent_op()
- // is likely to be called unconditionally in the loop. In this
- // case, the program would be ill-formed (on most architectures)
- // unless n were the same on all threads in a thread group.
- // Assuming n is the same on all threads, any kind of unrolling is
- // safe. But currently llvm's notion of convergence isn't powerful
- // enough to express this.
- if (UCE.Convergent)
- UP.AllowRemainder = false;
+ // TODO: This is somewhat conservative; we could allow the remainder if the
+ // trip count is uniform.
+ UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
// Try to find the trip count upper bound if we cannot find the exact trip
// count.
@@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
if (!UP.Count)
return LoopUnrollResult::Unmodified;
+ UP.Runtime &= UCE.ConvergenceAllowsRuntime;
+
if (PP.PeelCount) {
assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
@@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
// Unroll the loop.
Loop *RemainderLoop = nullptr;
+ UnrollLoopOptions ULO;
+ ULO.Count = UP.Count;
+ ULO.Force = UP.Force;
+ ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
+ ULO.UnrollRemainder = UP.UnrollRemainder;
+ ULO.Runtime = UP.Runtime;
+ ULO.ForgetAllSCEV = ForgetAllSCEV;
+ ULO.Heart = getLoopConvergenceHeart(L);
LoopUnrollResult UnrollResult = UnrollLoop(
- L,
- {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
- UP.UnrollRemainder, ForgetAllSCEV},
- LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
+ L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index eb471b259c7d..cfe63496a100 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1221,7 +1221,6 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
SmallPtrSet<const Value *, 4> ObjSet;
SmallVector<Metadata *, 4> Scopes, NoAliases;
- SmallSetVector<const Argument *, 4> NAPtrArgs;
for (const Value *V : PtrArgs) {
SmallVector<const Value *, 4> Objects;
getUnderlyingObjects(V, Objects, /* LI = */ nullptr);
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 08ba65d9483e..3d950b151cd3 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
L->dump());
return Rotated;
}
- if (Metrics.convergent) {
+ if (Metrics.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
"instructions: ";
L->dump());
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 1216538195fb..90d7b99e9d81 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
}
}
+// Loops containing convergent instructions that are uncontrolled or controlled
+// from outside the loop must have a count that divides their TripMultiple.
+LLVM_ATTRIBUTE_USED
+static bool canHaveUnrollRemainder(const Loop *L) {
+ if (getLoopConvergenceHeart(L))
+ return false;
+
+ // Check for uncontrolled convergent operations.
+ for (auto &BB : L->blocks()) {
+ for (auto &I : *BB) {
+ if (isa<ConvergenceControlInst>(I))
+ return true;
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->isConvergent())
+ return CB->getConvergenceControlToken();
+ }
+ }
+ return true;
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
@@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
return LoopUnrollResult::Unmodified;
}
- // Loops containing convergent instructions cannot use runtime unrolling,
- // as the prologue/epilogue may add additional control-dependencies to
- // convergent operations.
- LLVM_DEBUG(
- {
- bool HasConvergent = false;
- for (auto &BB : L->blocks())
- for (auto &I : *BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- HasConvergent |= CB->isConvergent();
- assert((!HasConvergent || !ULO.Runtime) &&
- "Can't runtime unroll if loop contains a convergent operation.");
- });
+ assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&
+ "Can't runtime unroll if loop contains a convergent operation.");
bool EpilogProfitability =
UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
@@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (OldLoop)
LoopsToSimplify.insert(NewLoops[OldLoop]);
- if (*BB == Header)
+ if (*BB == Header) {
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
for (PHINode *OrigPHI : OrigPHINode) {
@@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
NewPHI->eraseFromParent();
}
+ // Eliminate copies of the loop heart intrinsic, if any.
+ if (ULO.Heart) {
+ auto it = VMap.find(ULO.Heart);
+ assert(it != VMap.end());
+ Instruction *heartCopy = cast<Instruction>(it->second);
+ heartCopy->eraseFromParent();
+ VMap.erase(it);
+ }
+ }
+
// Update our running map of newest clones
LastValueMap[*BB] = New;
for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index e1af02829c1d..dd7150bc63ec 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder(
auto UnrollResult = LoopUnrollResult::Unmodified;
if (remainderLoop && UnrollRemainder) {
LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
- UnrollResult =
- UnrollLoop(remainderLoop,
- {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false,
- /*AllowExpensiveTripCount*/ false,
- /*UnrollRemainder*/ false, ForgetAllSCEV},
- LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+ UnrollLoopOptions ULO;
+ ULO.Count = Count - 1;
+ ULO.Force = false;
+ ULO.Runtime = false;
+ ULO.AllowExpensiveTripCount = false;
+ ULO.UnrollRemainder = false;
+ ULO.ForgetAllSCEV = ForgetAllSCEV;
+ assert(!getLoopConvergenceHeart(L) &&
+ "A loop with a convergence heart does not allow runtime unrolling.");
+ UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,
+ /*ORE*/ nullptr, PreserveLCSSA);
}
if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 058746880743..d6b4acb2bdba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -274,6 +274,13 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul,
+ /* Commutative =*/true>
+m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1);
+}
+
/// Match a binary OR operation. Note that while conceptually the operands can
/// be matched commutatively, \p Commutative defaults to false in line with the
/// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ab3b5cf2b9da..8ec67eb2f54b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1037,8 +1037,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
- m_Mul(m_SpecificInt(1), m_VPValue(A)))))
+ if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return R.getVPSingleValue()->replaceAllUsesWith(A);
}