diff options
Diffstat (limited to 'llvm/lib')
100 files changed, 4032 insertions, 743 deletions
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp index 2637e2f97dbb..ea67b526423b 100644 --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstructionCost.h" @@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues( completeEphemeralValues(Visited, Worklist, EphValues); } +static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) { + if (!L) + return false; + if (!isa<ConvergenceControlInst>(I)) + return false; + for (const auto *U : I.users()) { + if (!L->contains(cast<Instruction>(U))) + return true; + } + return false; +} + /// Fill in the current structure with information gleaned from the specified /// block. void CodeMetrics::analyzeBasicBlock( const BasicBlock *BB, const TargetTransformInfo &TTI, - const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) { + const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO, + const Loop *L) { ++NumBlocks; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { @@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock( if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy()) ++NumVectorInsts; - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) && + I.isUsedOutsideOfBlock(BB)) { + LLVM_DEBUG(dbgs() << I + << "\n Cannot duplicate a token value used outside " + "the current block (except convergence control).\n"); notDuplicatable = true; - - if (const CallInst *CI = dyn_cast<CallInst>(&I)) { - if (CI->cannotDuplicate()) - notDuplicatable = true; - if (CI->isConvergent()) - convergent = true; } - if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I)) - if (InvI->cannotDuplicate()) + if 
(const CallBase *CB = dyn_cast<CallBase>(&I)) { + if (CB->cannotDuplicate()) notDuplicatable = true; + // Compute a meet over the visited blocks for the following partial order: + // + // None -> { Controlled, ExtendedLoop, Uncontrolled} + // Controlled -> ExtendedLoop + if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) { + if (isa<ConvergenceControlInst>(CB) || + CB->getConvergenceControlToken()) { + assert(Convergence != ConvergenceKind::Uncontrolled); + LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n"); + if (extendsConvergenceOutsideLoop(I, L)) + Convergence = ConvergenceKind::ExtendedLoop; + else { + assert(Convergence != ConvergenceKind::ExtendedLoop); + Convergence = ConvergenceKind::Controlled; + } + } else { + assert(Convergence == ConvergenceKind::None); + Convergence = ConvergenceKind::Uncontrolled; + } + } + } NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); } diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 369ab087ffc0..c34c4974382e 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name, return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default); } +CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) { + BasicBlock *H = TheLoop->getHeader(); + for (Instruction &II : *H) { + if (auto *CB = dyn_cast<CallBase>(&II)) { + if (!CB->isConvergent()) + continue; + // This is the heart if it uses a token defined outside the loop. The + // verifier has already checked that only the loop intrinsic can use such + // a token. 
+ if (auto *Token = CB->getConvergenceControlToken()) { + auto *TokenDef = cast<Instruction>(Token); + if (!TheLoop->contains(TokenDef->getParent())) + return CB; + } + return nullptr; + } + } + return nullptr; +} + bool llvm::isFinite(const Loop *L) { return L->getHeader()->getParent()->willReturn(); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 08138a5e2f2d..782c28c94483 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7296,10 +7296,13 @@ static bool isGuaranteedNotToBeUndefOrPoison( isa<ConstantPointerNull>(C) || isa<Function>(C)) return true; - if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) - return (!includesUndef(Kind) ? !C->containsPoisonElement() - : !C->containsUndefOrPoisonElement()) && - !C->containsConstantExpression(); + if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) { + if (includesUndef(Kind) && C->containsUndefElement()) + return false; + if (includesPoison(Kind) && C->containsPoisonElement()) + return false; + return !C->containsConstantExpression(); + } } // Strip cast operations from a pointer value. diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 917094267d05..30728ed58750 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -68,6 +68,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::sqrt: // Begin floating-point. 
case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::tan: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::log: diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index d3ab306904da..7d7fe19568e8 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -604,6 +604,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(aarch64_vector_pcs); KEYWORD(aarch64_sve_vector_pcs); KEYWORD(aarch64_sme_preservemost_from_x0); + KEYWORD(aarch64_sme_preservemost_from_x1); KEYWORD(aarch64_sme_preservemost_from_x2); KEYWORD(msp430_intrcc); KEYWORD(avr_intrcc); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 07c8aa23fc5e..f0fde9ae4df5 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2153,6 +2153,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'aarch64_vector_pcs' /// ::= 'aarch64_sve_vector_pcs' /// ::= 'aarch64_sme_preservemost_from_x0' +/// ::= 'aarch64_sme_preservemost_from_x1' /// ::= 'aarch64_sme_preservemost_from_x2' /// ::= 'msp430_intrcc' /// ::= 'avr_intrcc' @@ -2212,6 +2213,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { case lltok::kw_aarch64_sme_preservemost_from_x0: CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0; break; + case lltok::kw_aarch64_sme_preservemost_from_x1: + CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1; + break; case lltok::kw_aarch64_sme_preservemost_from_x2: CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2; break; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp index b4765fb280f9..66b1c5f8ca82 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // 
-// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT. +// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT, +// G_INSERT_VECTOR_ELT, and G_VSCALE // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -400,3 +401,86 @@ bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI, return false; } + +bool CombinerHelper::matchAddOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GAdd *Add = cast<GAdd>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getLHSReg())); + GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getRHSReg())); + + Register Dst = Add->getReg(0); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) || + !MRI.hasOneNonDBGUse(RHSVScale->getReg(0))) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc() + RHSVScale->getSrc()); + }; + + return true; +} + +bool CombinerHelper::matchMulOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GMul *Mul = cast<GMul>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Mul->getLHSReg())); + + std::optional<APInt> MaybeRHS = getIConstantVRegVal(Mul->getRHSReg(), MRI); + if (!MaybeRHS) + return false; + + Register Dst = MO.getReg(); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0))) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc() * *MaybeRHS); + }; + + return true; +} + +bool CombinerHelper::matchSubOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GSub *Sub = cast<GSub>(MRI.getVRegDef(MO.getReg())); + GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Sub->getRHSReg())); + + Register Dst = MO.getReg(); + LLT DstTy = MRI.getType(Dst); + + if (!MRI.hasOneNonDBGUse(RHSVScale->getReg(0)) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, DstTy})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + 
auto VScale = B.buildVScale(DstTy, -RHSVScale->getSrc()); + B.buildAdd(Dst, Sub->getLHSReg(), VScale, Sub->getFlags()); + }; + + return true; +} + +bool CombinerHelper::matchShlOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GShl *Shl = cast<GShl>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Shl->getSrcReg())); + + std::optional<APInt> MaybeRHS = getIConstantVRegVal(Shl->getShiftReg(), MRI); + if (!MaybeRHS) + return false; + + Register Dst = MO.getReg(); + LLT DstTy = MRI.getType(Dst); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_VSCALE, DstTy})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc().shl(*MaybeRHS)); + }; + + return true; +} diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 6f0cae2edab1..9830b521797c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -449,6 +449,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(SIN_F); case TargetOpcode::G_FCOS: RTLIBCASE(COS_F); + case TargetOpcode::G_FTAN: + RTLIBCASE(TAN_F); case TargetOpcode::G_FLOG10: RTLIBCASE(LOG10_F); case TargetOpcode::G_FLOG: @@ -1037,6 +1039,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_FREM: case TargetOpcode::G_FCOS: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: @@ -2893,6 +2896,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FFLOOR: case TargetOpcode::G_FCOS: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: @@ -4659,6 +4663,7 @@ 
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: + case G_FTAN: case G_FSQRT: case G_BSWAP: case G_BITREVERSE: diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e8438be94b3c..129e6963aef3 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -833,6 +833,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, case TargetOpcode::G_FREM: case TargetOpcode::G_FSIN: case TargetOpcode::G_FCOS: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FMA: case TargetOpcode::G_FMAD: if (SNaN) @@ -1713,6 +1714,7 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) { case TargetOpcode::G_FREM: case TargetOpcode::G_FRINT: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FSQRT: case TargetOpcode::G_FSUB: case TargetOpcode::G_INTRINSIC_ROUND: diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 3397bd0a6060..a808a541103f 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1339,14 +1339,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (SrcIdx && DstIdx) return false; - [[maybe_unused]] const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); + const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; - unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), - DefMI->getOperand(0).getSubReg()); + unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx); if (NewDstIdx) NewDstReg = TRI->getSubReg(DstReg, NewDstIdx); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9a5359015439..02cd125eeff0 
100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4041,17 +4041,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, N0, SExt); } - // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X) - if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { - if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { - SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); - SDValue S0 = N1.getOperand(0); - if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) - if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) - if (C->getAPIntValue() == (BitWidth - 1)) - return DAG.getNode(ISD::ABS, DL, VT, S0); - } - } + // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A) + if (hasOperation(ISD::ABS, VT) && + sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) && + sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1)))) + return DAG.getNode(ISD::ABS, DL, VT, A); // If the relocation model supports it, consider symbol offsets. if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 8cd2bb60d81f..27c45cab2e0d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4514,6 +4514,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128, Results); break; + case ISD::FTAN: + case ISD::STRICT_FTAN: + ExpandFPLibCall(Node, RTLIB::TAN_F32, RTLIB::TAN_F64, RTLIB::TAN_F80, + RTLIB::TAN_F128, RTLIB::TAN_PPCF128, Results); + break; case ISD::FSINCOS: // Expand into sincos libcall. 
ExpandSinCosLibCall(Node, Results); @@ -5468,6 +5473,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FLOG: case ISD::FLOG2: case ISD::FLOG10: @@ -5492,6 +5498,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::STRICT_FSQRT: case ISD::STRICT_FSIN: case ISD::STRICT_FCOS: + case ISD::STRICT_FTAN: case ISD::STRICT_FLOG: case ISD::STRICT_FLOG2: case ISD::STRICT_FLOG10: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index fb1424f75e09..aa116c9de5d8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -131,6 +131,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::STRICT_FSUB: case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; + case ISD::STRICT_FTAN: + case ISD::FTAN: R = SoftenFloatRes_FTAN(N); break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; @@ -774,6 +776,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { RTLIB::SUB_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FTAN(SDNode *N) { + return SoftenFloatRes_Unary( + N, GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, RTLIB::TAN_F64, + RTLIB::TAN_F80, RTLIB::TAN_F128, RTLIB::TAN_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::TRUNC_F32, @@ -1330,7 +1338,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { #endif report_fatal_error("Do not know how to expand the result of this " "operator!"); - + // clang-format off case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: 
SplitRes_SELECT_CC(N, Lo, Hi); break; @@ -1399,6 +1407,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break; case ISD::STRICT_FSUB: case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break; + case ISD::STRICT_FTAN: + case ISD::FTAN: ExpandFloatRes_FTAN(N, Lo, Hi); break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break; case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break; @@ -1408,6 +1418,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break; case ISD::STRICT_FREM: case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break; + // clang-format on } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -1768,6 +1779,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, RTLIB::SUB_PPCF128), Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FTAN(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Unary(N, + GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, + RTLIB::TAN_F64, RTLIB::TAN_F80, + RTLIB::TAN_F128, RTLIB::TAN_PPCF128), + Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N, SDValue &Lo, SDValue &Hi) { ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), @@ -2479,6 +2499,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: + case ISD::FTAN: case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break; // Binary FP Operations @@ -2914,6 +2935,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: + case ISD::FTAN: case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break; // Binary FP Operations diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index bec9cb49b586..2350b562a034 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -586,6 +586,7 @@ private: SDValue SoftenFloatRes_FSIN(SDNode *N); SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); + SDValue SoftenFloatRes_FTAN(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N); @@ -635,6 +636,7 @@ private: SDValue &Lo, SDValue &Hi); void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC, SDValue &Lo, SDValue &Hi); + // clang-format off void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -667,9 +669,11 @@ private: void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FTAN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi); + // clang-format on // Float Operand Expansion. bool ExpandFloatOperand(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 6acbc044d673..8cdb4ba0ade6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -397,6 +397,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FLDEXP: case ISD::FPOWI: case ISD::FPOW: @@ -506,7 +507,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { break; \ } \ /* Defer non-vector results to LegalizeDAG. 
*/ \ - if (!Node->getValueType(0).isVector()) { \ + if (!Node->getValueType(0).isVector() && \ + Node->getValueType(0) != MVT::Other) { \ Action = TargetLowering::Legal; \ break; \ } \ @@ -990,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FMINIMUM: case ISD::FMAXIMUM: - if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) { - Results.push_back(Expanded); - return; - } - break; + Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)); + return; case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 361416edb554..92ce3b17ed6c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -108,6 +108,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: + case ISD::FTAN: case ISD::FTRUNC: case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: @@ -1140,6 +1141,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: case ISD::VP_SQRT: + case ISD::FTAN: case ISD::FTRUNC: case ISD::VP_FROUNDTOZERO: case ISD::SINT_TO_FP: @@ -4400,6 +4402,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: + case ISD::FTAN: case ISD::FTRUNC: if (unrollExpandedOp()) break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4a6a431696b5..e176cf2cc2a6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5375,6 +5375,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FREM: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FMA: case 
ISD::FMAD: { if (SNaN) @@ -6332,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, - EVT VT, ArrayRef<SDValue> Ops) { + EVT VT, ArrayRef<SDValue> Ops, + SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. @@ -6689,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, } // Constant fold the scalar operands. - SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps); + SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); // Legalize the (integer) scalar constant if necessary. if (LegalSVT != SVT) @@ -7260,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2})) + if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) return SV; // Canonicalize an UNDEF to the RHS, even over a constant. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ba76456b5836..2f3626f1c820 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1684,7 +1684,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values, if (!FragmentExpr) continue; SDDbgValue *SDV = DAG.getVRegDbgValue( - Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, SDNodeOrder); + Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, Order); DAG.AddDbgValue(SDV, false); Offset += RegisterSize; } @@ -1699,11 +1699,10 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values, } // We have created a SDDbgOperand for each Value in Values. - // Should use Order instead of SDNodeOrder? 
assert(!LocationOps.empty()); - SDDbgValue *SDV = DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies, - /*IsIndirect=*/false, DbgLoc, - SDNodeOrder, IsVariadic); + SDDbgValue *SDV = + DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies, + /*IsIndirect=*/false, DbgLoc, Order, IsVariadic); DAG.AddDbgValue(SDV, /*isParameter=*/false); return true; } @@ -6742,6 +6741,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::fabs: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::tan: case Intrinsic::exp10: case Intrinsic::floor: case Intrinsic::ceil: @@ -6759,6 +6759,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::fabs: Opcode = ISD::FABS; break; case Intrinsic::sin: Opcode = ISD::FSIN; break; case Intrinsic::cos: Opcode = ISD::FCOS; break; + case Intrinsic::tan: Opcode = ISD::FTAN; break; case Intrinsic::exp10: Opcode = ISD::FEXP10; break; case Intrinsic::floor: Opcode = ISD::FFLOOR; break; case Intrinsic::ceil: Opcode = ISD::FCEIL; break; @@ -9160,6 +9161,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; + case LibFunc_tan: + case LibFunc_tanf: + case LibFunc_tanl: + if (visitUnaryFloatCall(I, ISD::FTAN)) + return; + break; case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 2198c2354483..52da24b59451 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -210,6 +210,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FCOS: return "fcos"; case ISD::STRICT_FCOS: return "strict_fcos"; case ISD::FSINCOS: return "fsincos"; + case ISD::FTAN: return "ftan"; + case ISD::STRICT_FTAN: return "strict_ftan"; case ISD::FTRUNC: return "ftrunc"; case ISD::STRICT_FTRUNC: return "strict_ftrunc"; case 
ISD::FFLOOR: return "ffloor"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f856c8a51984..e1c1a6b09b11 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, bool IsMax = Opc == ISD::FMAXIMUM; SDNodeFlags Flags = N->getFlags(); - if (VT.isVector() && - isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType())) - return SDValue(); - // First, implement comparison not propagating NaN. If no native fmin or fmax // available, use plain select with setcc instead. SDValue MinMax; @@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, } else if (isOperationLegalOrCustom(CompOpc, VT)) { MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags); } else { + if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + return DAG.UnrollVectorOp(N); + // NaN (if exists) will be propagated later, so orderness doesn't matter. SDValue Compare = DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? 
ISD::SETGT : ISD::SETLT); @@ -9159,6 +9158,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, if (!IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::SMAX, VT)) { SDValue Zero = DAG.getConstant(0, dl, VT); + Op = DAG.getFreeze(Op); return DAG.getNode(ISD::SMAX, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); } @@ -9175,8 +9175,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, // 0 - abs(x) -> smin(x, sub(0,x)) if (IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::SMIN, VT)) { - Op = DAG.getFreeze(Op); SDValue Zero = DAG.getConstant(0, dl, VT); + Op = DAG.getFreeze(Op); return DAG.getNode(ISD::SMIN, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3aec7049e0cc..8240a1fd7e2f 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -141,6 +141,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { setLibcallName(RTLIB::EXP10_F128, "exp10f128"); setLibcallName(RTLIB::SIN_F128, "sinf128"); setLibcallName(RTLIB::COS_F128, "cosf128"); + setLibcallName(RTLIB::TAN_F128, "tanf128"); setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); setLibcallName(RTLIB::POW_F128, "powf128"); setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); @@ -1015,7 +1016,8 @@ void TargetLoweringBase::initActions() { setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::LROUND, - ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN}, + ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN, + ISD::FTAN}, {MVT::f32, MVT::f64, MVT::f128}, Expand); // Default ISD::TRAP to expand (which turns it into abort). 
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 004622061120..f44a6a472cb6 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1183,8 +1183,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID, StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { resolveRelocation(Section, Offset, - (uint64_t)Section.getAddressWithOffset(i->second), - RelType, 0); + Section.getLoadAddressWithOffset(i->second), RelType, 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) { // Create a new stub function. @@ -1217,8 +1216,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID, addRelocationForSection(REmovk_g0, Value.SectionID); } resolveRelocation(Section, Offset, - reinterpret_cast<uint64_t>(Section.getAddressWithOffset( - Section.getStubOffset())), + Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } @@ -1349,10 +1347,9 @@ RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation( - Section, Offset, - reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)), - RelType, 0); + resolveRelocation(Section, Offset, + Section.getLoadAddressWithOffset(i->second), RelType, + 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. 
@@ -1367,10 +1364,10 @@ RuntimeDyldELF::processRelocationRef( else addRelocationForSection(RE, Value.SectionID); - resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( - Section.getAddressWithOffset( - Section.getStubOffset())), - RelType, 0); + resolveRelocation( + Section, Offset, + Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType, + 0); Section.advanceStubOffset(getMaxStubSize()); } } else { @@ -1609,8 +1606,7 @@ RuntimeDyldELF::processRelocationRef( if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it resolveRelocation(Section, Offset, - reinterpret_cast<uint64_t>( - Section.getAddressWithOffset(i->second)), + Section.getLoadAddressWithOffset(i->second), RelType, 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else { @@ -1652,10 +1648,10 @@ RuntimeDyldELF::processRelocationRef( addRelocationForSection(REl, Value.SectionID); } - resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( - Section.getAddressWithOffset( - Section.getStubOffset())), - RelType, 0); + resolveRelocation( + Section, Offset, + Section.getLoadAddressWithOffset(Section.getStubOffset()), + RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID)) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2c4b45255d05..92213e19c9d9 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); // Loop is not unrollable if the loop contains certain instructions. 
- if (!UCE.canUnroll() || UCE.Convergent) { + if (!UCE.canUnroll()) { LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); return 1; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 7a5f18fe2cbd..0bf8be9ac55f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -326,6 +326,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: Out << "aarch64_sme_preservemost_from_x0"; break; + case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1: + Out << "aarch64_sme_preservemost_from_x1"; + break; case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: Out << "aarch64_sme_preservemost_from_x2"; break; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a7ed2de6e8a5..2f4b8351e747 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5368,8 +5368,8 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } - if (T.isRISCV64()) { - // Make i32 a native type for 64-bit RISC-V. + if (T.isLoongArch64() || T.isRISCV64()) { + // Make i32 a native type for 64-bit LoongArch and RISC-V. 
auto I = DL.find("-n64-"); if (I != StringRef::npos) return (DL.take_front(I) + "-n32:64-" + DL.drop_front(I + 5)).str(); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 985f9351f4a3..788e92f94b26 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -877,7 +877,7 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports, break; case wasm::WASM_EXTERNAL_TABLE: W->OS << char(Import.Table.ElemType); - encodeULEB128(0, W->OS); // flags + encodeULEB128(Import.Table.Limits.Flags, W->OS); encodeULEB128(NumElements, W->OS); // initial break; case wasm::WASM_EXTERNAL_TAG: @@ -1022,7 +1022,8 @@ void WasmObjectWriter::writeElemSection( encodeULEB128(TableNumber, W->OS); // the table number // init expr for starting offset - W->OS << char(wasm::WASM_OPCODE_I32_CONST); + W->OS << char(is64Bit() ? wasm::WASM_OPCODE_I64_CONST + : wasm::WASM_OPCODE_I32_CONST); encodeSLEB128(InitialTableOffset, W->OS); W->OS << char(wasm::WASM_OPCODE_END); diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 2b6bdbf24afa..cbc55a145e0e 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -586,6 +586,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx1150"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: return "gfx1151"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: + return "gfx1152"; // AMDGCN GFX12. 
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 8e2a9481c922..0fee299994bc 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -611,6 +611,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1103, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1150, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1151, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1152, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 316d05bf1dc3..8dd060d0151a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -139,6 +139,7 @@ #include "llvm/Transforms/IPO/DeadArgumentElimination.h" #include "llvm/Transforms/IPO/ElimAvailExtern.h" #include "llvm/Transforms/IPO/EmbedBitcodePass.h" +#include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 50682ca4970f..dad97146a9f6 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass()) MODULE_PASS("dxil-upgrade", DXILUpgradePass()) MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass()) MODULE_PASS("extract-blocks", BlockExtractorPass({}, false)) +MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable)) MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) MODULE_PASS("function-import", FunctionImportPass()) MODULE_PASS("globalopt", GlobalOptPass()) diff --git 
a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index fc3be716087e..693897f874a2 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -690,7 +690,7 @@ Error RawMemProfReader::readNextRecord( return F; auto Iter = this->GuidToSymbolName.find(F.Function); assert(Iter != this->GuidToSymbolName.end()); - F.SymbolName = Iter->getSecond(); + F.SymbolName = std::make_unique<std::string>(Iter->getSecond()); return F; }; return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback); diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index fcefdef992be..7360901f2962 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -867,21 +867,16 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, // Any intermediate directories we create should be accessible by // the owner, even if Perms says otherwise for the final path. const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all; + + StringRef Name = *I; while (true) { - StringRef Name = *I; - detail::InMemoryNode *Node = Dir->getChild(Name); + Name = *I; ++I; + if (I == E) + break; + detail::InMemoryNode *Node = Dir->getChild(Name); if (!Node) { - if (I == E) { - // End of the path. - Dir->addChild( - Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime, - std::move(Buffer), ResolvedUser, ResolvedGroup, - ResolvedType, ResolvedPerms})); - return true; - } - - // Create a new directory. Use the path up to here. + // This isn't the last element, so we create a new directory. Status Stat( StringRef(Path.str().begin(), Name.end() - Path.str().begin()), getDirectoryID(Dir->getUniqueID(), Name), @@ -891,27 +886,33 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, Name, std::make_unique<detail::InMemoryDirectory>(std::move(Stat)))); continue; } + // Creating file under another file. 
+ if (!isa<detail::InMemoryDirectory>(Node)) + return false; + Dir = cast<detail::InMemoryDirectory>(Node); + } + detail::InMemoryNode *Node = Dir->getChild(Name); + if (!Node) { + Dir->addChild(Name, + MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime, + std::move(Buffer), ResolvedUser, ResolvedGroup, + ResolvedType, ResolvedPerms})); + return true; + } + if (isa<detail::InMemoryDirectory>(Node)) + return ResolvedType == sys::fs::file_type::directory_file; - if (auto *NewDir = dyn_cast<detail::InMemoryDirectory>(Node)) { - Dir = NewDir; - } else { - assert((isa<detail::InMemoryFile>(Node) || - isa<detail::InMemoryHardLink>(Node)) && - "Must be either file, hardlink or directory!"); - - // Trying to insert a directory in place of a file. - if (I != E) - return false; + assert((isa<detail::InMemoryFile>(Node) || + isa<detail::InMemoryHardLink>(Node)) && + "Must be either file, hardlink or directory!"); - // Return false only if the new file is different from the existing one. - if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node)) { - return Link->getResolvedFile().getBuffer()->getBuffer() == - Buffer->getBuffer(); - } - return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() == - Buffer->getBuffer(); - } + // Return false only if the new file is different from the existing one. 
+ if (auto *Link = dyn_cast<detail::InMemoryHardLink>(Node)) { + return Link->getResolvedFile().getBuffer()->getBuffer() == + Buffer->getBuffer(); } + return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() == + Buffer->getBuffer(); } bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 4b2ce0d73949..5708b6173750 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -85,6 +85,10 @@ def SMEUnsupported : AArch64Unsupported { SME2Unsupported.F); } +def MTEUnsupported : AArch64Unsupported { + let F = [HasMTE]; +} + let F = [HasPAuth, HasPAuthLR] in def PAUnsupported : AArch64Unsupported; @@ -109,6 +113,7 @@ include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" include "AArch64SchedNeoverseV2.td" +include "AArch64SchedOryon.td" include "AArch64Processors.td" diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 32646c6ee689..941990c53c4a 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -589,6 +589,14 @@ def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 (sequence "X%u",19, 28), LR, FP)>; +// SME ABI support routines such as __arm_get_current_vg preserve most registers. +def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 + : CalleeSavedRegs<(add (sequence "Z%u", 0, 31), + (sequence "P%u", 0, 15), + (sequence "X%u", 1, 15), + (sequence "X%u",19, 28), + LR, FP)>; + // SME ABI support routines __arm_sme_state preserves most registers. 
def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 : CalleeSavedRegs<(add (sequence "Z%u", 0, 31), diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 8d16709114df..a759efcd9441 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -617,6 +617,27 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily", + "Oryon", + "Nuvia Inc Oryon processors", [ + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize, + FeatureFuseCryptoEOR, + FeatureFuseAddress, + FeatureSM4, + FeatureSHA2, + FeatureSHA3, + FeatureAES, + FeatureFullFP16, + FeatureFP16FML, + FeaturePerfMon, + FeatureSPE, + FeaturePostRAScheduler, + HasV8_6aOps]>; def ProcessorFeatures { list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -806,6 +827,11 @@ def ProcessorFeatures { FeatureSHA3, FeatureAES, FeatureCSSC, FeatureWFxT, FeatureFullFP16]; + list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureCrypto, FeatureRandGen, + FeaturePAuth, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES]; + // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not // affect code generated by the compiler and can be used only by explicitly @@ -988,3 +1014,7 @@ def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; + +// Qualcomm Oryon +def : ProcessorModel<"oryon-1", OryonModel, ProcessorFeatures.Oryon, + [TuneOryon]>; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index e97d7e3b6ed8..cc50b59dd8d7 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -107,13 +107,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " - "only supported to improve calls to SME ACLE save/restore/disable-za " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is only " + "supported to improve calls to SME ACLE save/restore/disable-za " "functions, and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + report_fatal_error( + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is " + "only supported to improve calls to SME ACLE __arm_get_current_vg " + "function, and is not intended to be used beyond that scope."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " "only supported to improve calls to SME ACLE __arm_sme_state " "and is not intended to 
be used beyond that scope."); if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() @@ -153,13 +162,22 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " "only supported to improve calls to SME ACLE save/restore/disable-za " "functions, and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + report_fatal_error( + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is " + "only supported to improve calls to SME ACLE __arm_get_current_vg " + "function, and is not intended to be used beyond that scope."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " "only supported to improve calls to SME ACLE __arm_sme_state " "and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) @@ -236,6 +254,8 @@ AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF, "Calling convention SVE_VectorCall is unsupported on Darwin."); if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask; + if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask; if (CC == 
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask; if (CC == CallingConv::CFGuard_Check) @@ -282,6 +302,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_AArch64_SVE_AAPCS_RegMask; if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask; + if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask; if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask; if (CC == CallingConv::CFGuard_Check) @@ -643,6 +665,7 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF, case CallingConv::AArch64_VectorCall: case CallingConv::AArch64_SVE_VectorCall: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: + case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: if (STI.isTargetWindows()) return HasReg(CC_AArch64_Win64PCS_ArgRegs, Reg); diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td new file mode 100644 index 000000000000..09d1af248f0e --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td @@ -0,0 +1,1659 @@ +//=- AArch64SchedOryon.td - Qualcomm Oryon CPU 001 ---*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Qualcomm Oryon +// family of processors. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pipeline Description. + +def OryonModel : SchedMachineModel { + let IssueWidth = 14; + let MicroOpBufferSize = 376; + let LoadLatency = 4; + let MispredictPenalty = 13; // 13 cycles for mispredicted branch. + let LoopMicroOpBufferSize = 0; // Do not have a LoopMicroOpBuffer + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + MTEUnsupported.F, + PAUnsupported.F, + [HasPAuth, HasCSSC]); +} + +let SchedModel = OryonModel in { + +// Issue ports. +// IXU has 6 ports p0 ~ p5 +// LSU has 4 ports p6 ~ p9(ls0 ~ ls3), p10/p11(std0, std1) has to work with ls0~ls3 +// VXU has 4 ports p12 ~ p15 + +// cross IXU/LSU/VXU resource group for FMOV P41 of VXU +// I2V +def ORYONI4FP0 : ProcResource<1>; +def ORYONI5FP1 : ProcResource<1>; +// V2I +def ORYONFP0I4 : ProcResource<1>; +def ORYONFP1I5 : ProcResource<1>; + +// store 1 for normal store instructions +def ORYONST0 : ProcResource<1>; +// store 2 for normal store instructions +def ORYONST1 : ProcResource<1>; + +// Port 0: ALU/Indirect/Direct Branch. +def ORYONP0 : ProcResource<1>; + +// Port 1: ALU/Direct Branch. +def ORYONP1 : ProcResource<1>; + +// Port 2: ALU. +def ORYONP2 : ProcResource<1>; + +// Port 3: ALU. +def ORYONP3 : ProcResource<1>; + +// Port 4: ALU. +def ORYONP4 : ProcResource<1> { + let Super = ORYONI4FP0; + let Super = ORYONFP0I4; } + +// Port 5: ALU. +def ORYONP5 : ProcResource<1> { + let Super = ORYONI5FP1; + let Super = ORYONFP1I5; } + +// Port 6: Load/Store. LS0 +def ORYONP6 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 7: Load/store. LS1 +def ORYONP7 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 8: Load/Store. LS2 +def ORYONP8 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 9: Load/store. 
LS3 +def ORYONP9 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 10: Load/Store. STD0 +def ORYONP10SD0 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 11: Load/store. STD1 +def ORYONP11SD1 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 12: FP/Neon/SIMD/Crypto. +def ORYONP12FP0 : ProcResource<1> { + let Super = ORYONI4FP0; + let Super = ORYONFP0I4; } + +// Port 13: FP/Neon/SIMD/Crypto. +def ORYONP13FP1 : ProcResource<1> { + let Super = ORYONI5FP1; + let Super = ORYONFP1I5; } + +// Port 14: FP/Neon/SIMD/Crypto. +def ORYONP14FP2 : ProcResource<1>; + +// Port 15: FP/Neon/SIMD/Crypto. +def ORYONP15FP3 : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes. + +// Integer add/shift/logical/misc. instructions on port I0/I1/I2/I3/I4/I5. +def ORYONI012345 : ProcResGroup<[ORYONP0, ORYONP1, ORYONP2, + ORYONP3, ORYONP4, ORYONP5]> { + let BufferSize = 120; +} + +// Direct Conditional Branch instructions on ports I0/I1. +def ORYONI01 : ProcResGroup<[ORYONP0, ORYONP1]> { + let BufferSize = 40; +} + +// Indirect/crypto Conditional Branch instructions on ports I0. +def ORYONI0 : ProcResGroup<[ORYONP0]> { + let BufferSize = 20; +} + +// Crypto/CRC/PAU instructions on ports I2. +def ORYONI2 : ProcResGroup<[ORYONP2]> { + let BufferSize = 20; +} + +// Multiply/Multiply-ADD instructions on ports I4/I5. +def ORYONI45 : ProcResGroup<[ORYONP4, ORYONP5]> { + let BufferSize = 40; +} + +// Divide instructions on ports I5. +def ORYONI5 : ProcResGroup<[ORYONP5]> { + let BufferSize = 20; +} + +// Comparison instructions on ports I0/I1/I2/I3. +def ORYONI0123 : ProcResGroup<[ORYONP0, ORYONP1, + ORYONP2, ORYONP3]> { + let BufferSize = 80; +} + +// Load instructions on ports P6/P7/P8/P9. 
+def ORYONLD : ProcResGroup<[ORYONP6, ORYONP7, ORYONP8, ORYONP9]> { + let BufferSize = 64; +} + +// Store instructions on combo of STA/STD pipes +def ORYONST : ProcResGroup<[ORYONST0, ORYONST1]> { + let BufferSize = 64; +} + +// Arithmetic and CRYP-AED ASIMD/FP instructions on ports FP0/FP1/FP2/FP3. +def ORYONFP0123 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1, + ORYONP14FP2, ORYONP15FP3]> { + let BufferSize = 192; +} + +// FP Comparison and F/I move instructions on ports FP0/FP1. +def ORYONFP01 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1]> { + let BufferSize = 96; +} + +// FDIV instructions on ports FP3. +def ORYONFP3 : ProcResGroup<[ORYONP15FP3]> { + let BufferSize = 48; +} + +// CRYP-SHA instructions on ports FP1. +def ORYONFP1 : ProcResGroup<[ORYONP14FP2]> { + let BufferSize = 48; +} + +def ORYONFP2 : ProcResGroup<[ORYONP14FP2]> { + let BufferSize = 48; +} + +// Reciprocal, Square root on FP0. +def ORYONFP0 : ProcResGroup<[ORYONP12FP0]> { + let BufferSize = 48; +} + +// cross IXU/LSU/VXU resource group for FMOV P41 of VXU +// I2V +def ORYONI2V : ProcResGroup<[ORYONI4FP0, ORYONI5FP1]> { + let BufferSize = 40; +} + +// V2I +def ORYONV2I : ProcResGroup<[ORYONFP0I4, ORYONFP1I5]> { + let BufferSize = 96; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: ORYONWrite_<NumCycles>Cyc_<Resources>. + +// Because of the complexity of Oryon CPU, we skip the following +// generic definitions and define each instruction specifically + +// These WriteRes entries are not used in the Oryon sched model.
+def : WriteRes<WriteImm, []> { let Unsupported = 1; } +def : WriteRes<WriteI, []> { let Unsupported = 1; } +def : WriteRes<WriteISReg, []> { let Unsupported = 1; } +def : WriteRes<WriteIEReg, []> { let Unsupported = 1; } +def : WriteRes<WriteExtr, []> { let Unsupported = 1; } +def : WriteRes<WriteIS, []> { let Unsupported = 1; } +def : WriteRes<WriteID32, []> { let Unsupported = 1; } +def : WriteRes<WriteID64, []> { let Unsupported = 1; } +def : WriteRes<WriteIM32, []> { let Unsupported = 1; } +def : WriteRes<WriteIM64, []> { let Unsupported = 1; } +def : WriteRes<WriteBr, []> { let Unsupported = 1; } +def : WriteRes<WriteBrReg, []> { let Unsupported = 1; } +def : WriteRes<WriteLD, []> { let Unsupported = 1; } +def : WriteRes<WriteST, []> { let Unsupported = 1; } +def : WriteRes<WriteSTP, []> { let Unsupported = 1; } +def : WriteRes<WriteAdr, []> { let Unsupported = 1; } +def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteF, []> { let Unsupported = 1; } +def : WriteRes<WriteFCmp, []> { let Unsupported = 1; } +def : WriteRes<WriteFCvt, []> { let Unsupported = 1; } +def : WriteRes<WriteFCopy, []> { let Unsupported = 1; } +def : WriteRes<WriteFImm, []> { let Unsupported = 1; } +def : WriteRes<WriteFMul, []> { let Unsupported = 1; } +def : WriteRes<WriteFDiv, []> { let Unsupported = 1; } +def : WriteRes<WriteVd, []> { let Unsupported = 1; } +def : WriteRes<WriteVq, []> { let Unsupported = 1; } +def : WriteRes<WriteVLD, []> { let Unsupported = 1; } +def : WriteRes<WriteVST, []> { let Unsupported = 1; } +def : WriteRes<WriteSys, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Unsupported = 1; } +def : WriteRes<WriteHint, []> { let Unsupported = 1; } +def : WriteRes<WriteLDHi, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// These ReadAdvance entries will be defined in later implementation +def : ReadAdvance<ReadI, 0>; +def : 
ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; +def : ReadAdvance<ReadST, 0>; + + +//IXU resource definition +// 1 cycles NO pipe +def ORYONWrite_1Cyc_NONE : SchedWriteRes<[]>; + +// 1 cycles on I01. +def ORYONWrite_1Cyc_I01 : SchedWriteRes<[ORYONI01]>; + +def ORYONWrite_1Cyc_2Uops_I01 : SchedWriteRes<[ORYONI01]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_I0 : SchedWriteRes<[ORYONI0]>; + +// 7 cycles on I2. PAC*/AUT* instructions +def ORYONWrite_7Cyc_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 7; +} + +// 7 cycles on I2. PAC*/AUT* instructions +def ORYONWrite_7Cyc_3Uops_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 9 (7+1+1) cycles on I2 and I0/I1, I0. Authentication branch instructions +// these instructions are broken down to three uops +// a. PtrAuth on pipe 2 taking 7 cycles +// b. Link Register Update on pipes 0 and 1 taking 1 cycle +// c. Indirect branch on pipe 0 taking 1 cycle + +def ORYONWrite_9Cyc_I012 : SchedWriteRes<[ORYONI2, ORYONI01]> { + let Latency = 9; + let NumMicroOps = 3; +} + +// 3 cycles on I2. 
CRC32 and CRC32C instructions +def ORYONWrite_3Cyc_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 3; +} + +// 1 cycle on I012345 +def ORYONWrite_1Cyc_I012345 : SchedWriteRes<[ORYONI012345]>; + +// 1 cycle on I0123 +def ORYONWrite_1Cyc_I0123 : SchedWriteRes<[ORYONI0123]>; + +// 1 cycle on 2 of I012345 +def ORYONWrite_1Cyc_I012345_I012345 : +SchedWriteRes<[ORYONI012345, ORYONI012345]> ; + +// 2 cycle on 2 of I0123 with ReleaseAtCycles +def ORYONWrite_2Cyc_I0123_I0123_RC : +SchedWriteRes<[ORYONI0123, ORYONI0123]> { + let Latency = 2; + let ReleaseAtCycles = [2,2]; +} + +// 2 cycle on 2 of I012345 +def ORYONWrite_2Cyc_I012345_I012345_RC : +SchedWriteRes<[ORYONI012345, ORYONI012345]> { + let Latency = 2; + let ReleaseAtCycles = [2,2]; +} + +// 3 cycle on 2 of I45 +def ORYONWrite_3Cyc_I45_I45_RC : +SchedWriteRes<[ORYONI45, ORYONI45]> { + let Latency = 3; + let ReleaseAtCycles = [2,2]; +} + +// 3 cycle on I45 +def ORYONWrite_3Cyc_I45 : SchedWriteRes<[ORYONI45]> { + let Latency = 3; +} + +// 7 cycle on I2 32-bit integer division +def ORYONWrite_7Cyc_I2_RC : SchedWriteRes<[ORYONI2]> { + let Latency = 7; + let ReleaseAtCycles = [2]; +} + +// 9 cycle on I2 64-bit integer division +def ORYONWrite_9Cyc_I2_RC : SchedWriteRes<[ORYONI2]> { + let Latency = 9; + let ReleaseAtCycles = [2]; +} + +// LSU resource definition +// need to define WriteLDAdr, WriteAdrAdr, WriteLDHi, WriteSTX +// 4 cycle on LS(P6789) +def ORYONWrite_4Cyc_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 4; +} + +// 4 cycle for Post/Pre inc/dec access, also covers all pair loads Post/Pre +def ORYONWrite_4Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 4; +} + +// 5 (4+1) for VXU SIMD access/could also include FP +// resource might not be correct, as VXU resource not included +def ORYONWrite_5Cyc_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; +} + +def ORYONWrite_5Cyc_2Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def ORYONWrite_5Cyc_3Uops_LD 
: SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def ORYONWrite_5Cyc_4Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def ORYONWrite_5Cyc_5Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 5; +} + +def ORYONWrite_5Cyc_6Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def ORYONWrite_5Cyc_8Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def ORYONWrite_5Cyc_10Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 10; +} + +// 6 cycle for Post/Pre inc/dec access +def ORYONWrite_5Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; +} + +def ORYONWrite_5Cyc_2Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def ORYONWrite_5Cyc_3Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def ORYONWrite_5Cyc_4Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def ORYONWrite_5Cyc_5Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 5; +} + +def ORYONWrite_5Cyc_6Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def ORYONWrite_5Cyc_8Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def ORYONWrite_5Cyc_10Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 10; +} + +// 1 cycle for all generic stores +def ORYONWrite_1Cyc_ST : SchedWriteRes<[ORYONST]>; + +def ORYONWrite_1Cyc_2Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_3Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_4Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 4; +} + +def 
ORYONWrite_1Cyc_5Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 5; +} + +def ORYONWrite_1Cyc_6Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 6; +} + +def ORYONWrite_1Cyc_8Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 8; +} + +def ORYONWrite_1Cyc_10Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 10; +} + +// 1 cycle for neon write: float + ASIMD with Post/Pre Inc/Dec access +// also includes Pair store until further informed +def ORYONWrite_1Cyc_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_2Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_3Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_4Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 4; +} + +def ORYONWrite_1Cyc_5Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 5; +} + +def ORYONWrite_1Cyc_6Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 6; +} + +def ORYONWrite_1Cyc_8Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 8; +} + +def ORYONWrite_1Cyc_10Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 10; +} + +// VXU resource definition + +// I2V instruction has 1 uOp +// I2V with convert has 2 uOps +// all I2V, V2I's throughputs are 2 +// On VXU doc, p37 -- latencies and throughput +// P41, resource taken, P42, uOps +def ORYONWrite_I2V_4Cyc_I45 : SchedWriteRes<[ORYONI2V]> { + let Latency = 4; +} + +// inline a FCVT, so add one more uOp +def ORYONWrite_I2V_7Cyc_I45 : SchedWriteRes<[ORYONI2V]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// V2I move instruction has 1/2 uOps, P42 in VXU doc +// Latency is 3, FCVT is also 3 cycle +// move + convert is 6 (3+3) cycles +// throughput is 2 +def ORYONWrite_V2I_3Cyc_FP01 : SchedWriteRes<[ORYONV2I]> { + let Latency
= 3; +} + +// inline a FCVT, so add one more uOp +def ORYONWrite_V2I_6Cyc_FP01 : SchedWriteRes<[ORYONV2I]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def ORYONWrite_V2V_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 2; +} + +def ORYONWrite_V2V_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; +} + +def ORYONWrite_V2V_6Cyc_FP01 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def ORYONWrite_4Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 4; +} + +def ORYONWrite_3Cyc_FP0 : SchedWriteRes<[ORYONFP0]> { + let Latency = 3; +} + +def ORYONWrite_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; +} + +def ORYONWrite_3Cyc_2Uops_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def ORYONWrite_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 2; +} + +def ORYONWrite_2Cyc_FP01 : SchedWriteRes<[ORYONFP01]> { + let Latency = 2; +} + +// 2 cycle on FP1 +def ORYONWrite_2Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 2; +} + +// 3 cycle on FP1 +def ORYONWrite_3Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 3; +} + +// 4 cycle , 0.5 throughput on FP1 +def ORYONWrite_4Cyc_FP1_RC4 : SchedWriteRes<[ORYONFP1]> { + let Latency = 4; + let ReleaseAtCycles = [4]; +} + +// 5 cycle , 1 throughput on FP1 +def ORYONWrite_5Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 5; +} + +// 8 cycle , 2 throughput on FP0123 +def ORYONWrite_8Cyc_FP0123_RC : SchedWriteRes<[ORYONFP0123]> { + let Latency = 8; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_6Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 6; +} + +def ORYONWrite_7Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 7; +} + +def ORYONWrite_8Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 8; +} + +def ORYONWrite_9Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 9; +} + +def ORYONWrite_10Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 10; +} + +def
ORYONWrite_8Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 8; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_10Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 10; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_13Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 13; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_4Cyc_FP0123_RC : +SchedWriteRes<[ORYONFP0123]> { + let Latency = 4; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_4Cyc_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123]> { + let Latency = 4; + let NumMicroOps = 2; + let ReleaseAtCycles = [2,2]; +} + +def ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123]> { + let Latency = 4; + let NumMicroOps = 3; + let ReleaseAtCycles = [3,3,3]; +} + +def ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123, ORYONFP0123]> { + let Latency = 6; + let NumMicroOps = 4; + let ReleaseAtCycles = [6,6,6,6]; +} + +//===----------------------------------------------------------------------===// +// Instruction Tables in IXU +//===----------------------------------------------------------------------===// + +//--- +// Arithmetic Instructions +//--- + +//1, 1, 6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^ADD(W|X)r(i|r|x)", "^SUB(W|X)r(i|r|x)")>; + +//2,2,3 +def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC], + (instregex "^ADD(W|X)rs", "^SUB(W|X)rs")>; + +//1,1,4 alias CMP, CMN on page 75 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ADDS(W|X)r(i|r|x)(64)?", "^SUBS(W|X)r(i|r|x)")>; + +//2,2,2 alias CMP, CMN on page 75 +def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC], + (instregex "^ADDS(W|X)rs", "^SUBS(W|X)rs")>; + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ADC(W|X)r","^SBC(W|X)r", + "^ADCS(W|X)r","^SBCS(W|X)r")>; + +//1,1,2 +def : InstRW<[ORYONWrite_1Cyc_2Uops_I01], + (instrs ADR,ADRP)>; + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex
"^CSEL(W|X)r", "^CSINV(W|X)r", + "^CSNEG(W|X)r", "^CSINC(W|X)r")>; + +//--- +//Compare Instruction +//--- + +// We have CCMP, CCMN as LLVM DAG node +// CMP is an alias of SUBS as above +// CMN is an alias of ADDS as above +// We also have no way to get shift compare node in LLVM +//2,2,1.5 CMP, CMN + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^CCMP(W|X)(i|r)", "^CCMN(W|X)(i|r)")>; + +//--- +// Branch +//--- + +def : InstRW<[ORYONWrite_1Cyc_NONE], (instrs B)>; +def : InstRW<[ORYONWrite_1Cyc_I01], (instrs BL)>; +def : InstRW<[ORYONWrite_1Cyc_I01], + (instrs Bcc, CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[ORYONWrite_1Cyc_I0], (instrs BR, BLR)>; +def : InstRW<[ORYONWrite_1Cyc_I0], (instrs RET)>; + +// 3 uOp, 1 cycle for branch, 7 cycle for Authentication, +// 1 cycle for updating link register +// V8.3a PAC +def : InstRW<[ORYONWrite_9Cyc_I012], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, + BRAA, BRAAZ, BRAB, BRABZ)>; +def : InstRW<[ORYONWrite_9Cyc_I012], (instrs RETAA, RETAB, ERETAA, ERETAB)>; + +def : InstRW<[ORYONWrite_7Cyc_3Uops_I2], (instregex "^LDRAA", "^LDRAB")>; + +// Logical Instructions +//--- + +//1,1,4 TST is an alias of ANDS +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ANDS(W|X)r(i|r|x)", "^BICS(W|X)r(i|r|x)")>; + +//2,2,2 TST shift is an alias +def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC], + (instregex "^ANDS(W|X)rs", "^BICS(W|X)rs")>; + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^AND(W|X)r(i|r|x)", "^EOR(W|X)r(i|r|x)", + "^ORR(W|X)r(i|r|x)", "^BIC(W|X)r(i|r|x)", + "^EON(W|X)r(i|r|x)", "^ORN(W|X)r(i|r|x)")>; + +//2,2,3 +def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC], + (instregex "^AND(W|X)rs", "^EOR(W|X)rs", "^ORR(W|X)rs", + "^BIC(W|X)rs", "^EON(W|X)rs", "^ORN(W|X)rs")>; + + +//--- +// Shift Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^ASRV(W|X)r", "^LSLV(W|X)r", + "^LSRV(W|X)r", "^RORV(W|X)r", + "RMIF")>; + +//--- +// Move-Data Bit-field
and Sign_Extension Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^MOVK(W|X)i", "^MOVN(W|X)i", + "^MOVZ(W|X)i", "^SBFM(W|X)ri", + "^UBFM(W|X)ri", "^BFM(W|X)ri", + "^SXT(W|B|H|X)", "^UXT(H|B)")>; + +// COPY instruction is an LLVM internal DAG node, needs further study +def : InstRW<[ORYONWrite_1Cyc_I012345], (instrs COPY)>; + +//--- +// Reverse Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^RBIT(W|X)r", "^REV(16|32|64)?(W|X)r")>; + + +//--- +// Flag Manipulate Instructions +//--- + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^SETF8", "^SETF16", "^CFINV")>; + +//--- +// Miscellaneous Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$", "^EXTR(W|X)rri")>; + + +//--- +// Multiply Instructions +//--- + +//1,3,2 +def : InstRW<[ORYONWrite_3Cyc_I45], + (instregex "^MADD(W|X)rrr", "^MSUB(W|X)rrr", + "^(S|U)MADDLrrr", "^(S|U)MSUBLrrr", + "^(S|U)MULHrr")>; + +//--- +// Divide Instructions +//--- + +def : InstRW<[ORYONWrite_7Cyc_I2_RC], + (instregex "^(S|U)DIVWr")>; + +def : InstRW<[ORYONWrite_9Cyc_I2_RC], + (instregex "^(S|U)DIVXr")>; + + +//--- +// Cryptography Instructions +// +//1,3,1 on I2 +def : InstRW<[ORYONWrite_3Cyc_I2], + (instregex "^CRC32(B|H|W|X)rr", "^CRC32C(B|H|W|X)rr")>; + +//--- +// PAU instructions +//--- + +// on p47 of IXU document, we have 7 cycles for all PAU instructions +// here we just assume all signing and pauth instructions are 7 cycles +// assume all are 7 cycles here + +// signing instructions +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs PACIA, PACIB, + PACDA, PACDB, + PACIZA, PACIZB, + PACDZA, PACDZB, + PACGA)>; +// authentication instructions +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs AUTIA, AUTIB, + AUTDA, AUTDB, + AUTIZA, AUTIZB, + AUTDZA, AUTDZB)>; +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs XPACI, XPACD)>; +
+//===----------------------------------------------------------------------===// +// Instruction Tables in LSU +//===----------------------------------------------------------------------===// + +// 4 cycle Load-to-use from L1D$ +// Neon load with 5 cycle +// 6 cycle to STA ? +// STD cycle ? +// NEON STD + 2 + +// Load Instructions +// FP Load Instructions + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSui)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXl)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBXi)>; +def
: InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSWi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPDpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPQpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPSpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPWpre)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpre)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPDpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPQpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPSpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPXpost)>; + +def :
InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroW)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroX)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURXi)>; +def :
InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSWi)>; + + + +// Store register, immed post-index +// NOTE: Handled by WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteST + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteSTP. + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURSi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURWi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURXi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRWi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRXi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPXi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPWi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPXi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPWi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRBui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRDui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRHui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRQui)>; +def :
InstRW<[ORYONWrite_1Cyc_ST], (instrs STRXui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRWui)>; + +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPDpre, STPDpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPSpre, STPSpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPWpre, STPWpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRBpre, STRBpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRDpre, STRDpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRHpre, STRHpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRQpre, STRQpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRSpre, STRSpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRWpre, STRWpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRBroW, STRBroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRDroW, STRDroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRHroW, STRHroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRQroW, STRQroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRSroW, STRSroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRWroW, STRWroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRXroW, STRXroX)>; + +// ASIMD Load instructions, 4 cycle access + 2 cycle NEON access +// ASIMD load, 1 element, multiple, 1 reg, D-form 1uOps +// ASIMD load, 1 element, multiple, 1 reg, Q-form 1uOps +def : InstRW<[ORYONWrite_5Cyc_LD], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_LD_I012345], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//
ASIMD load, 1 element, multiple, 2 reg, D-form 3 uOps +// ASIMD load, 1 element, multiple, 2 reg, Q-form 2 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD1Twov(8b|4h|2s|1d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], + (instregex "^LD1Twov(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form 4 uOps +// ASIMD load, 1 element, multiple, 3 reg, Q-form 3 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD1Threev(8b|4h|2s|1d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD1Threev(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form 6 uOps +// ASIMD load, 1 element, multiple, 4 reg, Q-form 4 uOps +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S 2uOps +// ASIMD load, 1 element, one lane, D 2uOps +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S 2uOps +// ASIMD load, 1 element, all lanes, D-form, D 2uOps +// ASIMD load, 1 element, all lanes, Q-form 2uOps +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def :
InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S 3 uOps +// ASIMD load, 2 element, multiple, Q-form, D 4 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H 3 uOps +// ASIMD load, 2 element, one lane, S 3 uOps +// ASIMD load, 2 element, one lane, D 3 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S 3 uOps +// ASIMD load, 2 element, all lanes, D-form, D 3 uOps +// ASIMD load, 2 element, all lanes, Q-form 3 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S 5 uOps +// ASIMD load, 3 element, multiple, Q-form, B/H/S 6 uOps +// ASIMD load, 3 element, multiple, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H 4 uOps +// ASIMD load, 3 element, one lane, S 4 uOps +// ASIMD load, 3 element, one lane, D 5 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD3i(64)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD3i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3i(64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S 4 uOps +// ASIMD load, 3 element, all lanes, D-form, D 5 uOps +// ASIMD load, 3 element, all lanes, Q-form, B/H/S 4 uOps +// ASIMD load, 3 element, all lanes, Q-form, D 5 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD3Rv(1d|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3Rv(1d|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S 6 uOps +// ASIMD load, 4 element, multiple, Q-form, B/H/S 10 uOps +// ASIMD load, 4 element, multiple, Q-form, D 8 uOps +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_10Uops_LD], + (instregex "^LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_8Uops_LD], + (instregex "^LD4Fourv(2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_10Uops_LD_I012345], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_8Uops_LD_I012345], + (instregex "^LD4Fourv(2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H 5 uOps +// ASIMD load, 4 element, one lane, S 5 uOps +// ASIMD load, 4 element, one lane, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], (instregex "^LD4i(64)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD4i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4i(64)_POST$")>; + +// ASIMD load, 4
element, all lanes, D-form, B/H/S 5 uOps +// ASIMD load, 4 element, all lanes, D-form, D 6 uOps +// ASIMD load, 4 element, all lanes, Q-form, B/H/S 5 uOps +// ASIMD load, 4 element, all lanes, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD4Rv(1d|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4Rv(1d|2d)_POST$")>; + +// ASIMD Store Instructions +// ASIMD store, 1 element, multiple, 1 reg, D-form 1 uOps +// ASIMD store, 1 element, multiple, 1 reg, Q-form 1 uOps +def : InstRW<[ORYONWrite_1Cyc_ST], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form 2 uOps +// ASIMD store, 1 element, multiple, 2 reg, Q-form 2 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form 3 uOps +// ASIMD store, 1 element, multiple, 3 reg, Q-form 3 uOps +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form 4 uOps +// ASIMD store, 1 element, multiple, 4 reg, Q-form 4 uOps +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S 2 uOps +// ASIMD store, 1 element, one lane, D 2 uOps +def :
InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S 2 uOps +// ASIMD store, 2 element, multiple, Q-form, B/H/S 4 uOps +// ASIMD store, 2 element, multiple, Q-form, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST2Twov(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S 2 uOps +// ASIMD store, 2 element, one lane, D 2 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S 4 uOps +// ASIMD store, 3 element, multiple, Q-form, B/H/S 6 uOps +// ASIMD store, 3 element, multiple, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST3Threev(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_6Uops_ST], + (instregex "^ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_6Uops_ST_I012345], + (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H 2 uOps +// ASIMD store, 3 element, one lane, S 2 uOps +// ASIMD store, 3 element, one lane, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], (instregex "^ST3i(8|16|32)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST3i(64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST3i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST3i(64)_POST$")>; + + +// ASIMD store, 4 element,
multiple, D-form, B/H/S 5 uOps +// ASIMD store, 4 element, multiple, Q-form, B/H/S 10 uOps +// ASIMD store, 4 element, multiple, Q-form, D 8 uOps +def : InstRW<[ORYONWrite_1Cyc_5Uops_ST], + (instregex "^ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_10Uops_ST], + (instregex "^ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_1Cyc_8Uops_ST], + (instregex "^ST4Fourv(2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_5Uops_ST_I012345], + (instregex "^ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_10Uops_ST_I012345], + (instregex "^ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_8Uops_ST_I012345], + (instregex "^ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H 3 uOps +// ASIMD store, 4 element, one lane, S 3 uOps +// ASIMD store, 4 element, one lane, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], (instregex "^ST4i(8|16|32)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST4i(64)$")>; +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345], + (instregex "^ST4i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST4i(64)_POST$")>; + + +//===----------------------------------------------------------------------===// +// Instruction Tables in VXU +//===----------------------------------------------------------------------===// +// all uOps are not clearly written in the VXU document + +// I2V +def : InstRW<[ORYONWrite_I2V_4Cyc_I45], (instregex "^FMOV[HSD][WX]r", "^FMOVDXHighr")>; + +// I2V with convert +def : InstRW<[ORYONWrite_I2V_7Cyc_I45], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>; + +// V2I +def : InstRW<[ORYONWrite_V2I_3Cyc_FP01], (instregex "^FMOV[WX][HSD]r", "FMOVXDHighr")>; + +// V2I with convert 2nd [SU] necessary?
+def : InstRW<[ORYONWrite_V2I_6Cyc_FP01], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; + +// float to float move immediate, row 7 in big chart +def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]r")>; +def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]i")>; + +// float to float conversion within VXU, precision conversion +def : InstRW<[ORYONWrite_V2V_6Cyc_FP01], (instregex "^FJCVTZS")>; +def : InstRW<[ORYONWrite_V2V_3Cyc_FP0123], (instregex "^FCVT[HSD][HSD]r", + "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// floating comparison write to NZCV +def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCMP(E)?[HSD]r[ir]")>; +def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCCMP(E)?[HSD]rr")>; + +// floating point conditional select +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FCSEL")>; + +// floating multiply-add +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^(F|FN)MADD", "^(F|FN)MSUB")>; + +// floating unary, cycle/throughput? xls row14 +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^F(ABS|NEG)[SD]r")>; + +//floating division/square root +def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVHrr")>; +def : InstRW<[ORYONWrite_8Cyc_FP3], (instregex "^FDIVSrr")>; +def : InstRW<[ORYONWrite_10Cyc_FP3], (instregex "^FDIVDrr")>; + +def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTHr")>; +def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTSr")>; +def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTDr")>; + +//========== +// SIMD move instructions +//========== + +// ASIMD DUP element +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^DUPv.+lane")>; +// ASIMD DUP general throughput undecided, 3? FP0123 +// VXU doc, p42, 2 uOps +def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^DUPv.+gpr")>; + +// ASIMD insert, element to element +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^INSv.+lane")>; +// ASIMD insert, gen reg 3? FP0123?
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^INSv.+gpr")>; + +// ASIMD move, FP immed +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FMOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^[SU]MOVv")>; + +//========== +// SIMD arithmetic instructions +//========== +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDv", "^SUBv", + "^BIFv", "^BITv", "^BSLv", + "^ANDv", "^BICv", "^EORv", + "^ORRv", "^ORNv")>; + + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// floating division +def : InstRW<[ORYONWrite_6Cyc_FP3], (instregex "^FDIVv.*16$")>; +def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVv.*32$")>; +def : InstRW<[ORYONWrite_9Cyc_FP3], (instregex "^FDIVv.*64$")>; + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMUL(X)?v", + "^FRECPSv", "^FRSQRTSv")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^MLAv","^MLSv", "^MULv", + "^PMULv", "UABAv")>; + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "SABAv", "SABDv", + "^(SH|UH)(ADD|SUB)v", + "^S(MAX|MIN)v", + "^(SQ|UQ)(ADD|SUB)v", + "^(SQ|SQR|UQ|UQR)SHLv", + "^(SR|UR)HADDv", + "^(SR|UR)SHLv", + "^UABDv", + "^U(MAX|MIN)v")>; +// IMAX or UMAX in the above line +//========== +// SIMD compare instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv", + "^CMHSv", + "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv", + "^FACGEv", "^FACGTv")>; + +//========== +// SIMD widening and narrowing arithmetic instructions +//========== +// No need to list ADDHN2, RADDHN2, RSUBHN2 as they are not distinguished +// from ADDHN, RADDHN, RSUBHN in td file(v16i8, v8i16, v4i32).
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDHNv", + "^SUBHNv", + "^RADDHNv", + "^RSUBHNv", + "^SABD(L|L2)v", "^UABD(L|L2)v", + "^(S|U)(ADD|SUB)(L|L2|W|W2)v")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^PMUL(L|L2)v","^SABA(L|L2)v", + "^(S|U|SQ)(MLA|MSL|MUL)(L|L2)v")>; + +//========== +// SIMD unary arithmetic instructions +//========== +//^MVNv is an alias of ^NOTv +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ABSv", "^CLSv","^CLZv", "^CNTv", + "^NEGv", "^NOTv", + "^RBITv", "^REV(16|32|64)v", + "^SQ(ABS|NEG)v", "^SQ(XT|XTU)(N|N2)v", + "^(SU|US)QADDv", + "^UQXT(N|N2)v", "^XTN2?v")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FCVT(L|L2|N|N2|XN|XN2)v", + "^FRINT[AIMNPXZ]v", + "^FRSQRTEv", + "^(S|U)ADALPv", + "^(S|U)ADDLPv")>; + + +def : InstRW<[ORYONWrite_3Cyc_FP0], (instregex "^URECPEv", "^URSQRTEv", + "^FRECPEv", "^FRECPXv")>; + +def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTv.*16$")>; +def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTv.*32$")>; +def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTv.*64$")>; + +//========== +// SIMD binary elememt arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMLAv", "^FMLSv")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^SQDMULHv", + "^SQRD(MLA|MLS|MUL)Hv")>; + +//========== +// SIMD permute instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^EXTv", "^TRN(1|2)v", + "^UZP(1|2)v", "^ZIP(1|2)v")>; + +//========== +// SIMD immediate instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^MOVIv", "^MVNIv")>; + +//========== +// SIMD shift(immediate) instructions +//========== +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^RSHR(N|N2)v", "^SHLv", + "^(SHL|SHR)(N|N2)v", + "^SLIv", + "^(SQ|SQR)SHR(U)?(N|N2)v", + "^(UQ|UQR)SHR(N|N2)v", + "^SQSHLUv", + "^SRIv", + "^(S|SR|U|UR)SHRv", + "^(S|SR|U|UR)SRAv", + "^(S|U)SHL(L|L2)v")>; + +//========== +// SIMD 
floating-point and integer conversion instructions +//========== +// same as above conversion + +//========== +// SIMD reduce (acoss vector lanes) instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDVv", + "^(FMAX|FMIN)(V|NMV)v", + "^(S|U)ADDLVv", + "^(S|U)(MAX|MIN)Vv")>; +//========== +// SIMD pairwise arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDPv", "^FADDPv", + "^(FMAX|FMIN)(NMP|P)v", + "^(S|U)(MIN|MAX)Pv")>; +//========== +// SIMD dot prodcut instructions +//========== + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(U|S)DOTv")>; + +//========== +// SIMD table lookup instructions +//========== +// TBL 1-reg/2-reg; TBX 1-reg, 1uOp, throughput=4 latency=2 +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instrs TBLv8i8One, TBLv16i8One, + TBXv8i8One, TBXv16i8One, + TBLv8i8Two, TBLv16i8Two)>; + +// TBL 3-reg/4-reg, 3uops, throughtput=4/3=1.33 latency=4 +def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC], + (instrs TBLv8i8Three, TBLv16i8Three, + TBLv8i8Four, TBLv16i8Four)>; + + +// TBX 2-reg 2 uOps, throughput=2 latency=4 +def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_RC], (instrs TBXv8i8Two, TBXv16i8Two)>; + +// TBX 3-reg/4-reg, 4uOps, throughput=1, latency=6 +def : InstRW<[ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC], + (instrs TBXv8i8Three, TBXv16i8Three, + TBXv8i8Four, TBXv16i8Four)>; + + +//========== +// SIMD complex number arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FCADDv", "^FCMLAv")>; + +//========== +// SIMD cryptographic instructions +//========== +// 3,4 on IMLA, CRYP +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^AES[DE]", + "^SM3(TT1|TT2)(A|B)")>; + +// 2,4 on CRYP +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^AESI?MC", + "^EOR3", + "^RAX1", + "^XAR", + "^BCAX", + "^SM3SS1", + "^SM3PART(W1|W2)")>; +// 5,1 on CRYP +def : InstRW<[ORYONWrite_5Cyc_FP1], (instregex "^SM4E", + "^SM4EKEY")>; + +// 2,1 on CRYP 
+def : InstRW<[ORYONWrite_2Cyc_FP1], (instregex "^SHA1(H|SU0|SU1)", + "^SHA256SU0", + "^SHA512(SU0|SU1)")>; + +// 3,1 on CRYP +def : InstRW<[ORYONWrite_3Cyc_FP1], (instregex "^SHA256SU1", + "^SHA512(H|H2)")>; + +// 4,0.25 on CRYP +def : InstRW<[ORYONWrite_4Cyc_FP1_RC4], (instregex "^SHA1(C|P|M)", + "^SHA256(H|H2)")>; + +//========== +// SIMD v8.6 instructions +//========== +// 4,2 on IMLA +def : InstRW<[ORYONWrite_4Cyc_FP0123_RC], (instregex "^(S|U|US)MMLA$")>; + +// 4,0.5 on IMLA +def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMMLA$")>; + +// 4,0.5 on IMLA +def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMLAL(B|T)")>; + +// 3,4 +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(US|SU)DOTv")>; + +// 3,1 +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^BF(16)?DOTv")>; + +// 3,4 +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^BFCVT(N|N2)?$")>; + + +} // SchedModel = OryonModel diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8bc26eeef34d..93ea729e2550 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -299,6 +299,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { PrefLoopAlignment = Align(64); MaxInterleaveFactor = 4; break; + case Oryon: + CacheLineSize = 64; + PrefFunctionAlignment = Align(16); + MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + break; } if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f49c73dc7951..9f5756fc7e40 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM( static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select", 
cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", + cl::init(true), cl::Hidden); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { return true; return BaseT::shouldTreatInstructionLikeSelect(I); } + +bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) { + // AArch64 specific here is adding the number of instructions to the + // comparison (though not as the first consideration, as some targets do) + // along with changing the priority of the base additions. + // TODO: Maybe a more nuanced tradeoff between instruction count + // and number of registers? To be investigated at a later date. + if (EnableLSRCostOpt) + return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost, + C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < + std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost, + C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost); + + return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); +}
\ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 2f44aaa3e26a..feec1a4289c3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -425,6 +425,9 @@ public: } std::optional<unsigned> getMinPageSize() const { return 4096; } + + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8e302786c746..d0d7a9dc1724 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1534,6 +1534,12 @@ def FeatureISAVersion11_5_1 : FeatureSet< FeatureVGPRSingleUseHintInsts, Feature1_5xVGPRs])>; +def FeatureISAVersion11_5_2 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureVGPRSingleUseHintInsts])>; + def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 625ac0230f16..2bdbf4151dd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1017,7 +1017,7 @@ public: // // TODO: We could filter out subgraphs that do not access LDS globals. 
for (Function *F : KernelsThatAllocateTableLDS) - removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id"); + removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); } DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c94ca1e4708..17c961578382 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -57,6 +57,7 @@ #include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Scalar.h" @@ -992,6 +993,10 @@ void AMDGPUPassConfig::addIRPasses() { if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + // This can be disabled by passing ::Disable here or on the command line + // with --expand-variadics-override=disable. + addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); + // Function calls are not supported, so make sure we inline everything. 
addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 2ada981a77cd..d218ffeb1fec 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -295,7 +295,11 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel, FeatureISAVersion11_5_1.Features >; -// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151] +def : ProcessorModel<"gfx1152", GFX11SpeedModel, + FeatureISAVersion11_5_2.Features +>; + +// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152] def : ProcessorModel<"gfx11-generic", GFX11SpeedModel, FeatureISAVersion11_Generic.Features >; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index d7d6e00d2389..e805e964ffe4 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -113,6 +113,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break; @@ -196,6 +197,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150; case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; + case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152; case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; case GK_GFX1201: 
return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c47eea20563d..8b42d4a1dee7 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2052,9 +2052,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList) const { - if (!(MI.mayLoad() ^ MI.mayStore())) - return false; - if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) return false; @@ -2065,10 +2062,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS : AMDGPUAS::FLAT_ADDRESS; - if (MI.mayLoad() && - TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) - return false; - if (AnchorList.count(&MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f178324dbbe2..5dc3457b5bfa 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,8 +103,6 @@ private: MachineBasicBlock *emitEndCf(MachineInstr &MI); - void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI); - void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const; @@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { return SplitBB; } -void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, - MachineInstr &MI) { - MachineFunction &MF = *MBB->getParent(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - bool IsWave32 = ST.isWave32(); - - if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { - // This should be before all vector instructions. - MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), - TII->get(IsWave32 ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec) - .addImm(MI.getOperand(0).getImm()); - if (LIS) { - LIS->RemoveMachineInstrFromMaps(MI); - LIS->InsertMachineInstrInMaps(*InitMI); - } - MI.eraseFromParent(); - return; - } - - // Extract the thread count from an SGPR input and set EXEC accordingly. - // Since BFM can't shift by 64, handle that case with CMP + CMOV. - // - // S_BFE_U32 count, input, {shift, 7} - // S_BFM_B64 exec, count, 0 - // S_CMP_EQ_U32 count, 64 - // S_CMOV_B64 exec, -1 - Register InputReg = MI.getOperand(0).getReg(); - MachineInstr *FirstMI = &*MBB->begin(); - if (InputReg.isVirtual()) { - MachineInstr *DefInstr = MRI->getVRegDef(InputReg); - assert(DefInstr && DefInstr->isCopy()); - if (DefInstr->getParent() == MBB) { - if (DefInstr != FirstMI) { - // If the `InputReg` is defined in current block, we also need to - // move that instruction to the beginning of the block. - DefInstr->removeFromParent(); - MBB->insert(FirstMI, DefInstr); - if (LIS) - LIS->handleMove(*DefInstr); - } else { - // If first instruction is definition then move pointer after it. - FirstMI = &*std::next(FirstMI->getIterator()); - } - } - } - - // Insert instruction sequence at block beginning (before vector operations). - const DebugLoc DL = MI.getDebugLoc(); - const unsigned WavefrontSize = ST.getWavefrontSize(); - const unsigned Mask = (WavefrontSize << 1) - 1; - Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); - auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) - .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); - if (LV) - LV->recomputeForSingleDefVirtReg(InputReg); - auto BfmMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? 
AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) - .addReg(CountReg) - .addImm(0); - auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) - .addReg(CountReg, RegState::Kill) - .addImm(WavefrontSize); - if (LV) - LV->getVarInfo(CountReg).Kills.push_back(CmpMI); - auto CmovMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), - Exec) - .addImm(-1); - - if (!LIS) { - MI.eraseFromParent(); - return; - } - - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - - LIS->InsertMachineInstrInMaps(*BfeMI); - LIS->InsertMachineInstrInMaps(*BfmMI); - LIS->InsertMachineInstrInMaps(*CmpMI); - LIS->InsertMachineInstrInMaps(*CmovMI); - - RecomputeRegs.insert(InputReg); - LIS->createAndComputeVirtRegInterval(CountReg); -} - bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { for (auto &I : MBB.instrs()) { if (!I.isDebugInstr() && !I.isUnconditionalBranch()) @@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { SplitMBB = process(MI); Changed = true; break; - - // FIXME: find a better place for this - case AMDGPU::SI_INIT_EXEC: - case AMDGPU::SI_INIT_EXEC_FROM_INPUT: - lowerInitExec(MBB, MI); - if (LIS) - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - Changed = true; - break; - - default: - break; } if (SplitMBB != MBB) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 09dc1c781e2f..5b4c44302fa6 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -177,6 +177,7 @@ private: SmallVector<MachineInstr *, 4> LowerToMovInstrs; SmallVector<MachineInstr *, 4> LowerToCopyInstrs; SmallVector<MachineInstr *, 4> KillInstrs; + SmallVector<MachineInstr *, 4> InitExecInstrs; void printInfo(); @@ -223,6 +224,8 @@ private: void lowerLiveMaskQueries(); void lowerCopyInstrs(); void lowerKillInstrs(bool IsWQM); + void lowerInitExec(MachineInstr &MI); + void 
lowerInitExecInstrs(); public: static char ID; @@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; + } else if (Opcode == AMDGPU::SI_INIT_EXEC || + Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) { + InitExecInstrs.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { } } +void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + bool IsWave32 = ST->isWave32(); + + if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { + // This should be before all vector instructions. + MachineInstr *InitMI = + BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + Exec) + .addImm(MI.getOperand(0).getImm()); + if (LIS) { + LIS->RemoveMachineInstrFromMaps(MI); + LIS->InsertMachineInstrInMaps(*InitMI); + } + MI.eraseFromParent(); + return; + } + + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + Register InputReg = MI.getOperand(0).getReg(); + MachineInstr *FirstMI = &*MBB->begin(); + if (InputReg.isVirtual()) { + MachineInstr *DefInstr = MRI->getVRegDef(InputReg); + assert(DefInstr && DefInstr->isCopy()); + if (DefInstr->getParent() == MBB) { + if (DefInstr != FirstMI) { + // If the `InputReg` is defined in current block, we also need to + // move that instruction to the beginning of the block. + DefInstr->removeFromParent(); + MBB->insert(FirstMI, DefInstr); + if (LIS) + LIS->handleMove(*DefInstr); + } else { + // If first instruction is definition then move pointer after it. 
+ FirstMI = &*std::next(FirstMI->getIterator()); + } + } + } + + // Insert instruction sequence at block beginning (before vector operations). + const DebugLoc DL = MI.getDebugLoc(); + const unsigned WavefrontSize = ST->getWavefrontSize(); + const unsigned Mask = (WavefrontSize << 1) - 1; + Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + auto BfmMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) + .addReg(CountReg) + .addImm(0); + auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(WavefrontSize); + auto CmovMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) + .addImm(-1); + + if (!LIS) { + MI.eraseFromParent(); + return; + } + + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + + LIS->InsertMachineInstrInMaps(*BfeMI); + LIS->InsertMachineInstrInMaps(*BfmMI); + LIS->InsertMachineInstrInMaps(*CmpMI); + LIS->InsertMachineInstrInMaps(*CmovMI); + + LIS->removeInterval(InputReg); + LIS->createAndComputeVirtRegInterval(InputReg); + LIS->createAndComputeVirtRegInterval(CountReg); +} + +void SIWholeQuadMode::lowerInitExecInstrs() { + for (MachineInstr *MI : InitExecInstrs) + lowerInitExec(*MI); +} + bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() << " ------------- \n"); @@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); KillInstrs.clear(); + InitExecInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget<GCNSubtarget>(); @@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Shader is simple does 
not need any state changes or any complex lowering if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && KillInstrs.empty()) { + lowerInitExecInstrs(); lowerLiveMaskQueries(); - return !LiveMaskQueries.empty(); + return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); } + lowerInitExecInstrs(); + MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index 239e0ee70572..04c6e940e6ed 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -235,8 +235,9 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { } void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, - StringRef FnAttr) { - KernelRoot->removeFnAttr(FnAttr); + ArrayRef<StringRef> FnAttrs) { + for (StringRef Attr : FnAttrs) + KernelRoot->removeFnAttr(Attr); SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()}; SmallPtrSet<Function *, 8> Visited; @@ -261,12 +262,15 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, Function *PotentialCallee = ExternalCallRecord.second->getFunction(); assert(PotentialCallee); - if (!isKernelLDS(PotentialCallee)) - PotentialCallee->removeFnAttr(FnAttr); + if (!isKernelLDS(PotentialCallee)) { + for (StringRef Attr : FnAttrs) + PotentialCallee->removeFnAttr(Attr); + } } } } else { - Callee->removeFnAttr(FnAttr); + for (StringRef Attr : FnAttrs) + Callee->removeFnAttr(Attr); if (Visited.insert(Callee).second) WorkList.push_back(Callee); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index 4d3ad328e131..e1cd4d03052b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -9,6 +9,7 @@ #ifndef 
LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -54,7 +55,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M); /// Strip FnAttr attribute from any functions where we may have /// introduced its use. void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, - StringRef FnAttr); + ArrayRef<StringRef> FnAttrs); /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check /// if this is actually a memory update or an artificial clobber to facilitate diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index a46c383115e2..919828753f45 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) { return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); } +static bool isHorizontalReduction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::HorizontalReduction) != 0; +} + namespace { using InstSet = SmallPtrSetImpl<MachineInstr *>; @@ -275,6 +281,16 @@ namespace { if (VPT->getOpcode() == ARM::MVE_VPST) return false; + // If the VPT block does not define something that is an "output", then + // the tail-predicated version will just perform a subset of the original + // vpt block, where the last lanes should not be used. 
+ if (isVPTOpcode(VPT->getOpcode()) && + all_of(Block.getInsts(), [](const MachineInstr *MI) { + return !MI->mayStore() && !MI->mayLoad() && + !isHorizontalReduction(*MI) && !isVCTP(MI); + })) + return true; + auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx)); return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op); @@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) { return (Flags & ARMII::DoubleWidthResult) != 0; } -static bool isHorizontalReduction(const MachineInstr &MI) { - const MCInstrDesc &MCID = MI.getDesc(); - uint64_t Flags = MCID.TSFlags; - return (Flags & ARMII::HorizontalReduction) != 0; -} - // Can this instruction generate a non-zero result when given only zeroed // operands? This allows us to know that, given operands with false bytes // zeroed by masked loads, that the result will also contain zeros in those diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 51384f25d245..9d7e4636abac 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -171,6 +171,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, // Set operations for 'F' feature. 
if (Subtarget.hasBasicF()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setCondCodeAction(FPCCToExpand, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -186,6 +188,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f32, Legal); @@ -202,7 +206,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, // Set operations for 'D' feature. if (Subtarget.hasBasicD()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setCondCodeAction(FPCCToExpand, MVT::f64, Expand); @@ -219,6 +225,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f64, Legal); @@ -5004,6 +5012,10 @@ bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; } +bool LoongArchTargetLowering::signExtendConstant(const ConstantInt *CI) const { + return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32); +} + bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const { // TODO: Support vectors. 
if (Y.getValueType().isVector()) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index f274b1971fd2..9328831a17a3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -229,6 +229,7 @@ public: bool isLegalAddImmediate(int64_t Imm) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; + bool signExtendConstant(const ConstantInt *CI) const override; bool hasAndNotCompare(SDValue Y) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 83466d53f84d..c29c1b593321 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -46,7 +46,7 @@ static cl::opt<bool> static std::string computeDataLayout(const Triple &TT) { if (TT.isArch64Bit()) - return "e-m:e-p:64:64-i64:64-i128:128-n64-S128"; + return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported"); return "e-m:e-p:32:32-i64:64-n32-S128"; } diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 5eefab59a6ab..b0cb24c63c3c 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -40,7 +40,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMLegacyPass(); ModulePass *createNVPTXCtorDtorLoweringLegacyPass(); -FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); +FunctionPass *createNVVMIntrRangePass(); FunctionPass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); @@ -53,12 +53,7 @@ MachineFunctionPass *createNVPTXPeephole(); 
MachineFunctionPass *createNVPTXProxyRegErasurePass(); struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> { - NVVMIntrRangePass(); - NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - -private: - unsigned SmVersion; }; struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> { diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index f63697916d90..82770f866085 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -542,30 +542,24 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. - unsigned Reqntidx, Reqntidy, Reqntidz; - Reqntidx = Reqntidy = Reqntidz = 1; - bool ReqSpecified = false; - ReqSpecified |= getReqNTIDx(F, Reqntidx); - ReqSpecified |= getReqNTIDy(F, Reqntidy); - ReqSpecified |= getReqNTIDz(F, Reqntidz); + std::optional<unsigned> Reqntidx = getReqNTIDx(F); + std::optional<unsigned> Reqntidy = getReqNTIDy(F); + std::optional<unsigned> Reqntidz = getReqNTIDz(F); - if (ReqSpecified) - O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz - << "\n"; + if (Reqntidx || Reqntidy || Reqntidz) + O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1) + << ", " << Reqntidz.value_or(1) << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. 
- unsigned Maxntidx, Maxntidy, Maxntidz; - Maxntidx = Maxntidy = Maxntidz = 1; - bool MaxSpecified = false; - MaxSpecified |= getMaxNTIDx(F, Maxntidx); - MaxSpecified |= getMaxNTIDy(F, Maxntidy); - MaxSpecified |= getMaxNTIDz(F, Maxntidz); - - if (MaxSpecified) - O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz - << "\n"; + std::optional<unsigned> Maxntidx = getMaxNTIDx(F); + std::optional<unsigned> Maxntidy = getMaxNTIDy(F); + std::optional<unsigned> Maxntidz = getMaxNTIDz(F); + + if (Maxntidx || Maxntidy || Maxntidz) + O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1) + << ", " << Maxntidz.value_or(1) << "\n"; unsigned Mincta = 0; if (getMinCTASm(F, Mincta)) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 4dc3cea4bd8e..b60a1d747af7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -233,9 +233,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); - // FIXME: NVVMIntrRangePass is causing numerical discrepancies, - // investigate and re-enable. - // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); + // Note: NVVMIntrRangePass was causing numerical discrepancies at one + // point, if issues crop up, consider disabling. 
+ FPM.addPass(NVVMIntrRangePass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 013afe916e86..3a536db1c972 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -128,6 +128,14 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, return true; } +static std::optional<unsigned> +findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) { + unsigned RetVal; + if (findOneNVVMAnnotation(&GV, PropName, RetVal)) + return RetVal; + return std::nullopt; +} + bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { auto &AC = getAnnotationCache(); @@ -252,32 +260,57 @@ std::string getSamplerName(const Value &val) { return std::string(val.getName()); } -bool getMaxNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "maxntidx", x); +std::optional<unsigned> getMaxNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidx"); } -bool getMaxNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "maxntidy", y); +std::optional<unsigned> getMaxNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidy"); } -bool getMaxNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "maxntidz", z); +std::optional<unsigned> getMaxNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidz"); +} + +std::optional<unsigned> getMaxNTID(const Function &F) { + // Note: The semantics here are a bit strange. The PTX ISA states the + // following (11.4.2. Performance-Tuning Directives: .maxntid): + // + // Note that this directive guarantees that the total number of threads does + // not exceed the maximum, but does not guarantee that the limit in any + // particular dimension is not exceeded. 
+ std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F); + std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F); + std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F); + if (MaxNTIDx || MaxNTIDy || MaxNTIDz) + return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1); + return std::nullopt; } bool getMaxClusterRank(const Function &F, unsigned &x) { return findOneNVVMAnnotation(&F, "maxclusterrank", x); } -bool getReqNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "reqntidx", x); +std::optional<unsigned> getReqNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidx"); +} + +std::optional<unsigned> getReqNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidy"); } -bool getReqNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "reqntidy", y); +std::optional<unsigned> getReqNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidz"); } -bool getReqNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "reqntidz", z); +std::optional<unsigned> getReqNTID(const Function &F) { + // Note: The semantics here are a bit strange. See getMaxNTID. 
+ std::optional<unsigned> ReqNTIDx = getReqNTIDx(F); + std::optional<unsigned> ReqNTIDy = getReqNTIDy(F); + std::optional<unsigned> ReqNTIDz = getReqNTIDz(F); + if (ReqNTIDx || ReqNTIDy || ReqNTIDz) + return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1); + return std::nullopt; } bool getMinCTASm(const Function &F, unsigned &x) { diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 2872db9fa213..e020bc0f02e9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -48,13 +48,15 @@ std::string getTextureName(const Value &); std::string getSurfaceName(const Value &); std::string getSamplerName(const Value &); -bool getMaxNTIDx(const Function &, unsigned &); -bool getMaxNTIDy(const Function &, unsigned &); -bool getMaxNTIDz(const Function &, unsigned &); - -bool getReqNTIDx(const Function &, unsigned &); -bool getReqNTIDy(const Function &, unsigned &); -bool getReqNTIDz(const Function &, unsigned &); +std::optional<unsigned> getMaxNTIDx(const Function &); +std::optional<unsigned> getMaxNTIDy(const Function &); +std::optional<unsigned> getMaxNTIDz(const Function &); +std::optional<unsigned> getMaxNTID(const Function &F); + +std::optional<unsigned> getReqNTIDx(const Function &); +std::optional<unsigned> getReqNTIDy(const Function &); +std::optional<unsigned> getReqNTIDz(const Function &); +std::optional<unsigned> getReqNTID(const Function &); bool getMaxClusterRank(const Function &, unsigned &); bool getMinCTASm(const Function &, unsigned &); diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 5381646434eb..f9d21b38a7ec 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -1,4 +1,4 @@ -//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===// +//===- NVVMIntrRange.cpp - Set range attributes for NVVM intrinsics -------===// // // Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,19 +6,21 @@ // //===----------------------------------------------------------------------===// // -// This pass adds appropriate !range metadata for calls to NVVM +// This pass adds appropriate range attributes for calls to NVVM // intrinsics that return a limited range of values. // //===----------------------------------------------------------------------===// #include "NVPTX.h" -#include "llvm/IR/Constants.h" +#include "NVPTXUtilities.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" +#include <cstdint> using namespace llvm; @@ -26,31 +28,20 @@ using namespace llvm; namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } -// Add !range metadata based on limits of given SM variant. 
-static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20), - cl::Hidden, cl::desc("SM variant")); - namespace { class NVVMIntrRange : public FunctionPass { - private: - unsigned SmVersion; - - public: - static char ID; - NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} - NVVMIntrRange(unsigned int SmVersion) - : FunctionPass(ID), SmVersion(SmVersion) { +public: + static char ID; + NVVMIntrRange() : FunctionPass(ID) { - initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); - } + initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &) override; + bool runOnFunction(Function &) override; }; -} +} // namespace -FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) { - return new NVVMIntrRange(SmVersion); -} +FunctionPass *llvm::createNVVMIntrRangePass() { return new NVVMIntrRange(); } char NVVMIntrRange::ID = 0; INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", @@ -58,112 +49,110 @@ INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", // Adds the passed-in [Low,High) range information as metadata to the // passed-in call instruction. -static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) { - // This call already has range metadata, nothing to do. 
- if (C->getMetadata(LLVMContext::MD_range)) +static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) { + if (II->getMetadata(LLVMContext::MD_range)) return false; - LLVMContext &Context = C->getParent()->getContext(); - IntegerType *Int32Ty = Type::getInt32Ty(Context); - Metadata *LowAndHigh[] = { - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)), - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))}; - C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh)); + const uint64_t BitWidth = II->getType()->getIntegerBitWidth(); + ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High)); + + if (auto CurrentRange = II->getRange()) + Range = Range.intersectWith(CurrentRange.value()); + + II->addRangeRetAttr(Range); return true; } -static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { +static bool runNVVMIntrRange(Function &F) { struct { unsigned x, y, z; } MaxBlockSize, MaxGridSize; - MaxBlockSize.x = 1024; - MaxBlockSize.y = 1024; - MaxBlockSize.z = 64; - MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff; + const unsigned MetadataNTID = getReqNTID(F).value_or( + getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max())); + + MaxBlockSize.x = std::min(1024u, MetadataNTID); + MaxBlockSize.y = std::min(1024u, MetadataNTID); + MaxBlockSize.z = std::min(64u, MetadataNTID); + + MaxGridSize.x = 0x7fffffff; MaxGridSize.y = 0xffff; MaxGridSize.z = 0xffff; // Go through the calls in this function. 
bool Changed = false; for (Instruction &I : instructions(F)) { - CallInst *Call = dyn_cast<CallInst>(&I); - if (!Call) + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) continue; - if (Function *Callee = Call->getCalledFunction()) { - switch (Callee->getIntrinsicID()) { - // Index within block - case Intrinsic::nvvm_read_ptx_sreg_tid_x: - Changed |= addRangeMetadata(0, MaxBlockSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_y: - Changed |= addRangeMetadata(0, MaxBlockSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_z: - Changed |= addRangeMetadata(0, MaxBlockSize.z, Call); - break; - - // Block size - case Intrinsic::nvvm_read_ptx_sreg_ntid_x: - Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_y: - Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_z: - Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call); - break; - - // Index within grid - case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: - Changed |= addRangeMetadata(0, MaxGridSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: - Changed |= addRangeMetadata(0, MaxGridSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: - Changed |= addRangeMetadata(0, MaxGridSize.z, Call); - break; - - // Grid size - case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: - Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: - Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: - Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call); - break; - - // warp size is constant 32. 
- case Intrinsic::nvvm_read_ptx_sreg_warpsize: - Changed |= addRangeMetadata(32, 32+1, Call); - break; - - // Lane ID is [0..warpsize) - case Intrinsic::nvvm_read_ptx_sreg_laneid: - Changed |= addRangeMetadata(0, 32, Call); - break; - - default: - break; - } + switch (II->getIntrinsicID()) { + // Index within block + case Intrinsic::nvvm_read_ptx_sreg_tid_x: + Changed |= addRangeAttr(0, MaxBlockSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_y: + Changed |= addRangeAttr(0, MaxBlockSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_z: + Changed |= addRangeAttr(0, MaxBlockSize.z, II); + break; + + // Block size + case Intrinsic::nvvm_read_ptx_sreg_ntid_x: + Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_y: + Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_z: + Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II); + break; + + // Index within grid + case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: + Changed |= addRangeAttr(0, MaxGridSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: + Changed |= addRangeAttr(0, MaxGridSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: + Changed |= addRangeAttr(0, MaxGridSize.z, II); + break; + + // Grid size + case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: + Changed |= addRangeAttr(1, MaxGridSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: + Changed |= addRangeAttr(1, MaxGridSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: + Changed |= addRangeAttr(1, MaxGridSize.z + 1, II); + break; + + // warp size is constant 32. 
+ case Intrinsic::nvvm_read_ptx_sreg_warpsize: + Changed |= addRangeAttr(32, 32 + 1, II); + break; + + // Lane ID is [0..warpsize) + case Intrinsic::nvvm_read_ptx_sreg_laneid: + Changed |= addRangeAttr(0, 32, II); + break; + + default: + break; } } return Changed; } -bool NVVMIntrRange::runOnFunction(Function &F) { - return runNVVMIntrRange(F, SmVersion); -} - -NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {} +bool NVVMIntrRange::runOnFunction(Function &F) { return runNVVMIntrRange(F); } PreservedAnalyses NVVMIntrRangePass::run(Function &F, FunctionAnalysisManager &AM) { - return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + return runNVVMIntrRange(F) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index f4e84ade3b5a..bc0ae7a32c05 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1079,13 +1079,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert(IsAIX && TM.getCodeModel() == CodeModel::Small && "PseudoOp only valid for small code model AIX"); - // Transform %rN = ADDItoc/8 @op1, %r2. + // Transform %rN = ADDItoc/8 %r2, @op1. LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); // Change the opcode to load address. TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8)); - const MachineOperand &MO = MI->getOperand(1); + const MachineOperand &MO = MI->getOperand(2); assert(MO.isGlobal() && "Invalid operand for ADDItoc[8]."); // Map the operand to its corresponding MCSymbol. 
@@ -1094,7 +1094,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_None, OutContext); - TmpInst.getOperand(1) = TmpInst.getOperand(2); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 735050641adf..a07954bd0d8b 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -2080,13 +2080,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { cast<GlobalVariable>(GV)->hasAttribute("toc-data"); // For small code model, generate a simple TOC load. - if (CModel == CodeModel::Small) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, - IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), - DestReg) - .addGlobalAddress(GV) - .addReg(PPC::X2); - else { + if (CModel == CodeModel::Small) { + auto MIB = BuildMI( + *FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + IsAIXTocData ? 
TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), DestReg); + if (IsAIXTocData) + MIB.addReg(PPC::X2).addGlobalAddress(GV); + else + MIB.addGlobalAddress(GV).addReg(PPC::X2); + } else { // If the address is an externally defined symbol, a symbol with common // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 275b3337a276..1a69d1e89313 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -6102,8 +6102,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) { EVT OperandTy) { SDValue GA = TocEntry->getOperand(0); SDValue TocBase = TocEntry->getOperand(1); - SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); - transferMemOperands(TocEntry, MN); + SDNode *MN = nullptr; + if (OpCode == PPC::ADDItoc || OpCode == PPC::ADDItoc8) + // toc-data access doesn't involve in loading from got, no need to + // keep memory operands. 
+ MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, TocBase, GA); + else { + MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); + transferMemOperands(TocEntry, MN); + } ReplaceNode(TocEntry, MN); }; diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 9af8ada78376..eda5eb975e70 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1485,11 +1485,9 @@ def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry: } // Local Data Transform -def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), +def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItoc8", - [(set i64:$rD, - (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; - + []>, isPPC64; let mayLoad = 1 in def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index df6b2bf1a7b7..09f829943528 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3345,10 +3345,8 @@ def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), "#ADDIStocHA", []>; // TOC Data Transform on AIX -def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), - "#ADDItoc", - [(set i32:$rD, - (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tocentry32:$disp), + "#ADDItoc", []>; def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), "#ADDItocL", []>; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index a96768240a93..82358cdd45ed 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -932,11 +932,11 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { "Can't handle X0, X0 vsetvli yet"); if (AVLReg == RISCV::X0) NewInfo.setAVLVLMAX(); - else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS)) - NewInfo.setAVLRegDef(VNI, AVLReg); - else { - assert(MI.getOperand(1).isUndef()); + else if (MI.getOperand(1).isUndef()) NewInfo.setAVLIgnored(); + else { + VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS); + NewInfo.setAVLRegDef(VNI, AVLReg); } } NewInfo.setVTYPE(MI.getOperand(2).getImm()); @@ -1008,11 +1008,11 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { } else InstrInfo.setAVLImm(Imm); - } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) { - InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); - } else { - assert(VLOp.isUndef()); + } else if (VLOp.isUndef()) { InstrInfo.setAVLIgnored(); + } else { + VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS); + InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); } } else { assert(isScalarExtractInstr(MI)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 6d926ce551e0..b0949f5fc1d7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1033,6 +1033,22 @@ class VPseudoUnaryNoMask<DAGOperand RetClass, let HasVecPolicyOp = 1; } +class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass, + DAGOperand OpClass, + string Constraint = "", + int TargetConstraintType = 1> : + Pseudo<(outs RetClass:$rd), + (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = Constraint; + let TargetOverlapConstraintType = TargetConstraintType; + let HasVLOp = 1; + let HasSEWOp = 1; +} + class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass, DAGOperand OpClass, 
string Constraint = "", @@ -1422,24 +1438,6 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, let UsesVXRM = 0; } -// Like VPseudoBinaryNoMask, but output can be V0. -class VPseudoBinaryMOutNoMask<VReg RetClass, - VReg Op1Class, - DAGOperand Op2Class, - string Constraint, - int TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = Constraint; - let TargetOverlapConstraintType = TargetConstraintType; - let HasVLOp = 1; - let HasSEWOp = 1; -} - // Like VPseudoBinaryMask, but output can be V0. class VPseudoBinaryMOutMask<VReg RetClass, RegisterClass Op1Class, @@ -2056,9 +2054,10 @@ multiclass VPseudoVSFS_M { foreach mti = AllMasks in { defvar mx = mti.LMul.MX; let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>, + def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forceMergeOpRead=true>; + let ForceTailAgnostic = true in def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forceMergeOpRead=true>; @@ -2172,8 +2171,8 @@ multiclass VPseudoBinaryM<VReg RetClass, int TargetConstraintType = 1, bit Commutable = 0> { let VLMul = MInfo.value, isCommutable = Commutable in { - def "_" # MInfo.MX : VPseudoBinaryMOutNoMask<RetClass, Op1Class, Op2Class, - Constraint, TargetConstraintType>; + def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class, + Constraint, TargetConstraintType>; let ForceTailAgnostic = true in def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask<RetClass, Op1Class, Op2Class, Constraint, TargetConstraintType>, @@ -4078,9 +4077,8 @@ class VPatMaskUnaryNoMask<string intrinsic_name, (mti.Mask VR:$rs2), VLOpFrag)), (!cast<Instruction>(inst#"_M_"#mti.BX) - (mti.Mask (IMPLICIT_DEF)), (mti.Mask VR:$rs2), - 
GPR:$vl, mti.Log2SEW, TA_MA)>; + GPR:$vl, mti.Log2SEW)>; class VPatMaskUnaryMask<string intrinsic_name, string inst, @@ -4153,27 +4151,6 @@ class VPatBinaryNoMaskTU<string intrinsic_name, (op2_type op2_kind:$rs2), GPR:$vl, sew, TU_MU)>; -class VPatBinaryNoMaskRoundingMode<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - int sew, - VReg op1_reg_class, - DAGOperand op2_kind> : - Pat<(result_type (!cast<Intrinsic>(intrinsic_name) - (result_type (undef)), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - (XLenVT timm:$round), - VLOpFrag)), - (!cast<Instruction>(inst) - (result_type (IMPLICIT_DEF)), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - (XLenVT timm:$round), - GPR:$vl, sew, TA_MA)>; - class VPatBinaryNoMaskTURoundingMode<string intrinsic_name, string inst, ValueType result_type, @@ -4827,8 +4804,6 @@ multiclass VPatBinaryRoundingMode<string intrinsic, VReg result_reg_class, VReg op1_reg_class, DAGOperand op2_kind> { - def : VPatBinaryNoMaskRoundingMode<intrinsic, inst, result_type, op1_type, op2_type, - sew, op1_reg_class, op2_kind>; def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type, sew, result_reg_class, op1_reg_class, op2_kind>; def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type, @@ -6962,12 +6937,12 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors, foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. 
let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector undef), + def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge), (vti.Vector vti.RegClass:$rs1), (XLenVT 1), VLOpFrag)), (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, - vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TA_MA)>; + vti.RegClass:$merge, vti.RegClass:$rs1, + vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>; def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge), (vti.Vector vti.RegClass:$rs1), (XLenVT 1), diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 956b851fce6c..49838e685a6d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1459,11 +1459,22 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call, Component == 3 ? NumActualRetComponents - 1 : Component; assert(ExtractedComposite < NumActualRetComponents && "Invalid composite index!"); + Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType); + SPIRVType *NewType = nullptr; + if (QueryResultType->getOpcode() == SPIRV::OpTypeVector) { + Register NewTypeReg = QueryResultType->getOperand(1).getReg(); + if (TypeReg != NewTypeReg && + (NewType = GR->getSPIRVTypeForVReg(NewTypeReg)) != nullptr) + TypeReg = NewTypeReg; + } MIRBuilder.buildInstr(SPIRV::OpCompositeExtract) .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(TypeReg) .addUse(QueryResult) .addImm(ExtractedComposite); + if (NewType != nullptr) + insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder, + MIRBuilder.getMF().getRegInfo()); } else { // More than 1 component is expected, fill a new vector. 
auto MIB = MIRBuilder.buildInstr(SPIRV::OpVectorShuffle) @@ -2063,16 +2074,30 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call, auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR); switch (Opcode) { - case SPIRV::OpGroupAsyncCopy: - return MIRBuilder.buildInstr(Opcode) - .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) - .addUse(Scope) - .addUse(Call->Arguments[0]) - .addUse(Call->Arguments[1]) - .addUse(Call->Arguments[2]) - .addUse(buildConstantIntReg(1, MIRBuilder, GR)) - .addUse(Call->Arguments[3]); + case SPIRV::OpGroupAsyncCopy: { + SPIRVType *NewType = + Call->ReturnType->getOpcode() == SPIRV::OpTypeEvent + ? nullptr + : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder); + Register TypeReg = GR->getSPIRVTypeID(NewType ? NewType : Call->ReturnType); + unsigned NumArgs = Call->Arguments.size(); + Register EventReg = Call->Arguments[NumArgs - 1]; + bool Res = MIRBuilder.buildInstr(Opcode) + .addDef(Call->ReturnRegister) + .addUse(TypeReg) + .addUse(Scope) + .addUse(Call->Arguments[0]) + .addUse(Call->Arguments[1]) + .addUse(Call->Arguments[2]) + .addUse(Call->Arguments.size() > 4 + ? 
Call->Arguments[3] + : buildConstantIntReg(1, MIRBuilder, GR)) + .addUse(EventReg); + if (NewType != nullptr) + insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder, + MIRBuilder.getMF().getRegInfo()); + return Res; + } case SPIRV::OpGroupWaitEvents: return MIRBuilder.buildInstr(Opcode) .addUse(Scope) diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 24c6c2688642..edc9e1a33d9f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -586,6 +586,7 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC // Async Copy and Prefetch builtin records: defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>; +defm : DemangledNativeBuiltin<"async_work_group_strided_copy", OpenCL_std, AsyncCopy, 5, 5, OpGroupAsyncCopy>; defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>; defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>; defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>; diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 5ef0be1cab72..bbd25dc85f52 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -61,9 +61,6 @@ class SPIRVEmitIntrinsics DenseMap<Instruction *, Type *> AggrConstTypes; DenseSet<Instruction *> AggrStores; - // a registry of created Intrinsic::spv_assign_ptr_type instructions - DenseMap<Value *, CallInst *> AssignPtrTypeInstr; - // deduce element type of untyped pointers Type *deduceElementType(Value *I); Type *deduceElementTypeHelper(Value *I); @@ -98,14 +95,16 @@ class SPIRVEmitIntrinsics return B.CreateIntrinsic(IntrID, {Types}, Args); } + void buildAssignType(IRBuilder<> &B, Type *ElemTy, Value 
*Arg); void buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg); + void updateAssignType(CallInst *AssignCI, Value *Arg, Value *OfType); void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B); void processInstrAfterVisit(Instruction *I, IRBuilder<> &B); void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B); void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B); - void insertAssignTypeInstrForTargetExtTypes(TargetExtType *AssignedType, - Value *V, IRBuilder<> &B); + void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V, + IRBuilder<> &B); void replacePointerOperandWithPtrCast(Instruction *I, Value *Pointer, Type *ExpectedElementType, unsigned OperandToReplace, @@ -218,15 +217,39 @@ static inline void reportFatalOnTokenType(const Instruction *I) { false); } +void SPIRVEmitIntrinsics::buildAssignType(IRBuilder<> &B, Type *Ty, + Value *Arg) { + Value *OfType = PoisonValue::get(Ty); + CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, + {Arg->getType()}, OfType, Arg, {}, B); + GR->addAssignPtrTypeInstr(Arg, AssignCI); +} + void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg) { - CallInst *AssignPtrTyCI = - buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()}, - Constant::getNullValue(ElemTy), Arg, - {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B); + Value *OfType = PoisonValue::get(ElemTy); + CallInst *AssignPtrTyCI = buildIntrWithMD( + Intrinsic::spv_assign_ptr_type, {Arg->getType()}, OfType, Arg, + {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B); GR->addDeducedElementType(AssignPtrTyCI, ElemTy); GR->addDeducedElementType(Arg, ElemTy); - AssignPtrTypeInstr[Arg] = AssignPtrTyCI; + GR->addAssignPtrTypeInstr(Arg, AssignPtrTyCI); +} + +void SPIRVEmitIntrinsics::updateAssignType(CallInst *AssignCI, Value *Arg, + Value *OfType) { + LLVMContext &Ctx = Arg->getContext(); + AssignCI->setArgOperand( + 1, MetadataAsValue::get( + Ctx, MDNode::get(Ctx, 
ValueAsMetadata::getConstant(OfType)))); + if (cast<IntrinsicInst>(AssignCI)->getIntrinsicID() != + Intrinsic::spv_assign_ptr_type) + return; + + // update association with the pointee type + Type *ElemTy = OfType->getType(); + GR->addDeducedElementType(AssignCI, ElemTy); + GR->addDeducedElementType(Arg, ElemTy); } // Set element pointer type to the given value of ValueTy and tries to @@ -513,19 +536,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(Instruction *I) { if (!Ty) { GR->addDeducedElementType(Op, KnownElemTy); // check if there is existing Intrinsic::spv_assign_ptr_type instruction - auto It = AssignPtrTypeInstr.find(Op); - if (It == AssignPtrTypeInstr.end()) { + CallInst *AssignCI = GR->findAssignPtrTypeInstr(Op); + if (AssignCI == nullptr) { Instruction *User = dyn_cast<Instruction>(Op->use_begin()->get()); setInsertPointSkippingPhis(B, User ? User->getNextNode() : I); CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {OpTy}, OpTyVal, Op, {B.getInt32(getPointerAddressSpace(OpTy))}, B); - AssignPtrTypeInstr[Op] = CI; + GR->addAssignPtrTypeInstr(Op, CI); } else { - It->second->setArgOperand( - 1, - MetadataAsValue::get( - Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OpTyVal)))); + updateAssignType(AssignCI, Op, OpTyVal); } } else { if (auto *OpI = dyn_cast<Instruction>(Op)) { @@ -559,7 +579,9 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old, if (isAssignTypeInstr(U)) { B.SetInsertPoint(U); SmallVector<Value *, 2> Args = {New, U->getOperand(1)}; - B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args); + CallInst *AssignCI = + B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args); + GR->addAssignPtrTypeInstr(New, AssignCI); U->eraseFromParent(); } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) || isa<CallInst>(U)) { @@ -751,33 +773,39 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { return NewI; } -void 
SPIRVEmitIntrinsics::insertAssignTypeInstrForTargetExtTypes( +void SPIRVEmitIntrinsics::insertAssignPtrTypeTargetExt( TargetExtType *AssignedType, Value *V, IRBuilder<> &B) { - // Do not emit spv_assign_type if the V is of the AssignedType already. - if (V->getType() == AssignedType) - return; + Type *VTy = V->getType(); - // Do not emit spv_assign_type if there is one already targetting V. If the - // found spv_assign_type assigns a type different than AssignedType, report an - // error. Builtin types cannot be redeclared or casted. - for (auto User : V->users()) { - auto *II = dyn_cast<IntrinsicInst>(User); - if (!II || II->getIntrinsicID() != Intrinsic::spv_assign_type) - continue; + // A couple of sanity checks. + assert(isPointerTy(VTy) && "Expect a pointer type!"); + if (auto PType = dyn_cast<TypedPointerType>(VTy)) + if (PType->getElementType() != AssignedType) + report_fatal_error("Unexpected pointer element type!"); - MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1)); - Type *BuiltinType = - dyn_cast<ConstantAsMetadata>(VMD->getMetadata())->getType(); - if (BuiltinType != AssignedType) - report_fatal_error("Type mismatch " + BuiltinType->getTargetExtName() + - "/" + AssignedType->getTargetExtName() + - " for value " + V->getName(), - false); + CallInst *AssignCI = GR->findAssignPtrTypeInstr(V); + if (!AssignCI) { + buildAssignType(B, AssignedType, V); return; } - Constant *Const = UndefValue::get(AssignedType); - buildIntrWithMD(Intrinsic::spv_assign_type, {V->getType()}, Const, V, {}, B); + Type *CurrentType = + dyn_cast<ConstantAsMetadata>( + cast<MetadataAsValue>(AssignCI->getOperand(1))->getMetadata()) + ->getType(); + if (CurrentType == AssignedType) + return; + + // Builtin types cannot be redeclared or casted. 
+ if (CurrentType->isTargetExtTy()) + report_fatal_error("Type mismatch " + CurrentType->getTargetExtName() + + "/" + AssignedType->getTargetExtName() + + " for value " + V->getName(), + false); + + // Our previous guess about the type seems to be wrong, let's update + // inferred type according to a new, more precise type information. + updateAssignType(AssignCI, V, PoisonValue::get(AssignedType)); } void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast( @@ -850,7 +878,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast( ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B); GR->addDeducedElementType(CI, ExpectedElementType); GR->addDeducedElementType(Pointer, ExpectedElementType); - AssignPtrTypeInstr[Pointer] = CI; + GR->addAssignPtrTypeInstr(Pointer, CI); return; } @@ -929,8 +957,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) { Value *ArgOperand = CI->getArgOperand(OpIdx); - if (!isa<PointerType>(ArgOperand->getType()) && - !isa<TypedPointerType>(ArgOperand->getType())) + if (!isPointerTy(ArgOperand->getType())) continue; // Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs() @@ -952,8 +979,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, continue; if (ExpectedType->isTargetExtTy()) - insertAssignTypeInstrForTargetExtTypes(cast<TargetExtType>(ExpectedType), - ArgOperand, B); + insertAssignPtrTypeTargetExt(cast<TargetExtType>(ExpectedType), + ArgOperand, B); else replacePointerOperandWithPtrCast(CI, ArgOperand, ExpectedType, OpIdx, B); } @@ -1145,7 +1172,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()}, EltTyConst, I, {B.getInt32(AddressSpace)}, B); GR->addDeducedElementType(CI, ElemTy); - AssignPtrTypeInstr[I] = CI; + GR->addAssignPtrTypeInstr(I, CI); } void 
SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, @@ -1164,20 +1191,32 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, TypeToAssign = It->second; } } - Constant *Const = UndefValue::get(TypeToAssign); - buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I, {}, B); + buildAssignType(B, TypeToAssign, I); } for (const auto &Op : I->operands()) { if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) || // Check GetElementPtrConstantExpr case. (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) { setInsertPointSkippingPhis(B, I); - if (isa<UndefValue>(Op) && Op->getType()->isAggregateType()) - buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op, - UndefValue::get(B.getInt32Ty()), {}, B); - else if (!isa<Instruction>(Op)) - buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {}, - B); + Type *OpTy = Op->getType(); + if (isa<UndefValue>(Op) && OpTy->isAggregateType()) { + CallInst *AssignCI = + buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op, + UndefValue::get(B.getInt32Ty()), {}, B); + GR->addAssignPtrTypeInstr(Op, AssignCI); + } else if (!isa<Instruction>(Op)) { + Type *OpTy = Op->getType(); + if (auto PType = dyn_cast<TypedPointerType>(OpTy)) { + buildAssignPtr(B, PType->getElementType(), Op); + } else if (isPointerTy(OpTy)) { + Type *ElemTy = GR->findDeducedElementType(Op); + buildAssignPtr(B, ElemTy ? 
ElemTy : deduceElementType(Op), Op); + } else { + CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, + {OpTy}, Op, Op, {}, B); + GR->addAssignPtrTypeInstr(Op, AssignCI); + } + } } } } @@ -1368,14 +1407,12 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { continue; insertAssignPtrTypeIntrs(I, B); + deduceOperandElementType(I); insertAssignTypeIntrs(I, B); insertPtrCastOrAssignTypeInstr(I, B); insertSpirvDecorations(I, B); } - for (auto &I : instructions(Func)) - deduceOperandElementType(&I); - for (auto *I : Worklist) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa<StoreInst>(I)) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ef0973d03d15..db01f68f48de 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -73,8 +73,11 @@ class SPIRVGlobalRegistry { // untyped pointers. DenseMap<Value *, Type *> DeducedElTys; // Maps composite values to deduced types where untyped pointers are replaced - // with typed ones + // with typed ones. DenseMap<Value *, Type *> DeducedNestedTys; + // Maps values to "assign type" calls, thus being a registry of created + // Intrinsic::spv_assign_ptr_type instructions. + DenseMap<Value *, CallInst *> AssignPtrTypeInstr; // Add a new OpTypeXXX instruction without checking for duplicates. SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder, @@ -149,6 +152,17 @@ public: return It == FunResPointerTypes.end() ? nullptr : It->second; } + // A registry of "assign type" records: + // - Add a record. + void addAssignPtrTypeInstr(Value *Val, CallInst *AssignPtrTyCI) { + AssignPtrTypeInstr[Val] = AssignPtrTyCI; + } + // - Find a record. + CallInst *findAssignPtrTypeInstr(const Value *Val) { + auto It = AssignPtrTypeInstr.find(Val); + return It == AssignPtrTypeInstr.end() ? 
nullptr : It->second; + } + // Deduced element types of untyped pointers and composites: // - Add a record to the map of deduced element types. void addDeducedElementType(Value *Val, Type *Ty) { DeducedElTys[Val] = Ty; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 3d536085b78a..a0a253c23b1e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -417,7 +417,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); // G_GLOBAL_VALUE already has type info. - if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) + if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE && + Def->getOpcode() != SPIRV::ASSIGN_TYPE) insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, MF.getRegInfo()); ToErase.push_back(&MI); @@ -427,7 +428,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); // G_GLOBAL_VALUE already has type info. 
- if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) + if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE && + Def->getOpcode() != SPIRV::ASSIGN_TYPE) insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo()); ToErase.push_back(&MI); } else if (MIOp == TargetOpcode::G_CONSTANT || diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 8e2063121e00..f5bc584ac4e1 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -178,14 +178,15 @@ static wasm::WasmLimits DefaultLimits() { } static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx, - const StringRef &Name) { + const StringRef &Name, + bool is64) { MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); - Sym->setFunctionTable(); + Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. 
Sym->setUndefined(); } @@ -258,7 +259,7 @@ public: MCAsmParserExtension::Initialize(Parser); DefaultFunctionTable = GetOrCreateFunctionTableSymbol( - getContext(), "__indirect_function_table"); + getContext(), "__indirect_function_table", is64); if (!STI->checkFeatures("+reference-types")) DefaultFunctionTable->setOmitFromLinkingSection(); } @@ -508,7 +509,7 @@ public: auto &Tok = Lexer.getTok(); if (Tok.is(AsmToken::Identifier)) { auto *Sym = - GetOrCreateFunctionTableSymbol(getContext(), Tok.getString()); + GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64); const auto *Val = MCSymbolRefExpr::create(Sym, getContext()); *Op = std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(), @@ -836,6 +837,9 @@ public: // symbol auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); + if (is64) { + Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64; + } wasm::WasmTableType Type = {*ElemType, Limits}; WasmSym->setTableType(Type); TOut.emitTableType(WasmSym); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 5e7279808cce..c5a047ee47d7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -108,8 +108,9 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { + bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit(); Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); - Sym->setFunctionTable(); + Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. 
Sym->setUndefined(); } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 0bf3294af92a..3933e82b718f 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) break; + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal ? X86::PTILELOADDV : X86::PTILELOADDT1V; @@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; } case Intrinsic::x86_tilestored64_internal: { + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); unsigned Opc = X86::PTILESTOREDV; // _tile_stored_internal(row, col, buf, STRIDE, c) SDValue Base = Node->getOperand(4); @@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_tilestored64: { if (!Subtarget->hasAMXTILE()) break; + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); unsigned Opc; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7d30de15f84d..3fbab3af32bb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -615,6 +615,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSIN, VT, Action); setOperationAction(ISD::FCOS, VT, Action); setOperationAction(ISD::FSINCOS, VT, Action); + setOperationAction(ISD::FTAN, VT, Action); setOperationAction(ISD::FSQRT, VT, Action); setOperationAction(ISD::FPOW, VT, Action); setOperationAction(ISD::FLOG, VT, Action); @@ -833,9 
+834,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Always expand sin/cos functions even though x87 has an instruction. + // clang-format off setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); + setOperationAction(ISD::FTAN , MVT::f80, Expand); + // clang-format on setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); @@ -888,11 +892,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + // clang-format off setOperationAction(ISD::FSIN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); setOperationAction(ISD::FCOS, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); + setOperationAction(ISD::FTAN, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall); + // clang-format on // No STRICT_FSINCOS setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); @@ -944,9 +952,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { + // clang-format off setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FTAN, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -956,6 +966,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); 
setOperationAction(ISD::FEXP10, VT, Expand); + // clang-format on } // First set operation action for all vector types to either promote @@ -2473,7 +2484,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) - for (ISD::NodeType Op : + // clang-format off + for (ISD::NodeType Op : {ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FEXP, ISD::STRICT_FEXP, @@ -2482,9 +2494,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, - ISD::FSIN, ISD::STRICT_FSIN}) + ISD::FSIN, ISD::STRICT_FSIN, + ISD::FTAN, ISD::STRICT_FTAN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); + // clang-format on // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, @@ -26776,7 +26790,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::swift_async_context_addr: { SDLoc dl(Op); auto &MF = DAG.getMachineFunction(); - auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); + auto *X86FI = MF.getInfo<X86MachineFunctionInfo>(); if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) { MF.getFrameInfo().setFrameAddressIsTaken(true); X86FI->setHasSwiftAsyncContext(true); @@ -36781,7 +36795,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case TargetOpcode::PREALLOCATED_SETUP: { assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); - auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setHasPreallocatedCall(true); int64_t PreallocatedId = MI.getOperand(0).getImm(); size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); @@ -36798,7 +36812,7 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); int64_t PreallocatedId = MI.getOperand(1).getImm(); int64_t ArgIdx = MI.getOperand(2).getImm(); - auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx << ", arg offset " << ArgOffset << "\n"); @@ -36841,6 +36855,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, unsigned Imm = MI.getOperand(0).getImm(); BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); MI.eraseFromParent(); // The pseudo is gone now. + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + return BB; + } + case X86::PTILEZEROV: { + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); return BB; } case X86::PTILELOADD: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index c47bee070e04..99deacc811a1 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in { GR16:$src2, opaquemem:$src3, TILE:$src4), []>; let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, - canFoldAsLoad = 1 in + canFoldAsLoad = 1, usesCustomInserter = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index b69058787a4e..079ac983a8a0 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -92,6 +92,14 @@ static bool isAMXIntrinsic(Value *I) { return false; } +static bool containsAMXCode(Function &F) { + for 
(BasicBlock &BB : F) + for (Instruction &I : BB) + if (I.getType()->isX86_AMXTy()) + return true; + return false; +} + static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, Type *Ty) { Function &F = *BB->getParent(); @@ -1230,6 +1238,14 @@ public: } bool runOnFunction(Function &F) override { + // Performance optimization: most code doesn't use AMX, so return early if + // there are no instructions that produce AMX values. This is sufficient, as + // AMX arguments and constants are not allowed -- so any producer of an AMX + // value must be an instruction. + // TODO: find a cheaper way for this, without looking at all instructions. + if (!containsAMXCode(F)) + return false; + bool C = false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); TargetLibraryInfo *TLI = diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index f27676a27e86..613722b398f4 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -19,6 +19,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() { } bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); const X86InstrInfo *TII = ST.getInstrInfo(); const TargetRegisterInfo *TRI = ST.getRegisterInfo(); @@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; for (MachineBasicBlock &MBB : MF) { - // There won't be a tile copy if neither tile register live in nor live out. 
- bool HasTileCopy = false; - for (const auto &LI : MBB.liveins()) { - if (TILERegs.test(LI.PhysReg)) { - HasTileCopy = true; - break; - } - } LiveRegUnits UsedRegs(*TRI); UsedRegs.addLiveOuts(MBB); - if (!HasTileCopy) { - for (auto RegT : TILERegs.set_bits()) { - if (UsedRegs.available(RegT)) { - HasTileCopy = true; - break; - } - } - } - if (!HasTileCopy) - continue; for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) { UsedRegs.stepBackward(MI); if (!MI.isCopy()) diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index f6e853270e07..8aaa49945f9d 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -21,6 +21,8 @@ namespace llvm { +enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 }; + /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. class X86MachineFunctionInfo : public MachineFunctionInfo { @@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// The AMX programing model used in the function. + AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None; + /// True if this function has a subset of CSRs that is handled explicitly via /// copies. 
bool IsSplitCSR = false; @@ -219,6 +224,13 @@ public: int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; } + void setAMXProgModel(AMXProgModelEnum Model) { + assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) && + "mixed model is not supported"); + AMXProgModel = Model; + } + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 2d296771b1c0..186d4d84c251 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -620,11 +620,11 @@ def : WriteRes<WriteNop, []>; // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort015], 6, [2,1], 3, 6>; -defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort015], 6, [2,1], 3, 7>; +defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort01], 6, [2,1], 3, 6>; +defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort01], 6, [2,1], 3, 7>; defm : ICXWriteResPair<WritePHAdd, [ICXPort5,ICXPort05], 3, [2,1], 3, 5>; -defm : ICXWriteResPair<WritePHAddX, [ICXPort5,ICXPort015], 3, [2,1], 3, 6>; -defm : ICXWriteResPair<WritePHAddY, [ICXPort5,ICXPort015], 3, [2,1], 3, 7>; +defm : ICXWriteResPair<WritePHAddX, [ICXPort15,ICXPort015], 3, [2,1], 3, 6>; +defm : ICXWriteResPair<WritePHAddY, [ICXPort15,ICXPort015], 3, [2,1], 3, 7>; // Remaining instrs. 
@@ -886,7 +886,7 @@ def ICXWriteResGroup37 : SchedWriteRes<[ICXPort0,ICXPort5]> { } def: InstRW<[ICXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>; -def ICXWriteResGroup38 : SchedWriteRes<[ICXPort5,ICXPort01]> { +def ICXWriteResGroup38 : SchedWriteRes<[ICXPort15,ICXPort01]> { let Latency = 3; let NumMicroOps = 3; let ReleaseAtCycles = [2,1]; @@ -1739,13 +1739,13 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort01]> { def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; -def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { +def ICXWriteResGroup143 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> { let Latency = 9; let NumMicroOps = 4; let ReleaseAtCycles = [2,1,1]; } -def: InstRW<[ICXWriteResGroup143], (instregex "(V?)PHADDSWrm", - "(V?)PHSUBSWrm")>; +def: InstRW<[ICXWriteResGroup143], (instrs PHADDSWrm, VPHADDSWrm, + PHSUBSWrm, VPHSUBSWrm)>; def ICXWriteResGroup146 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> { let Latency = 9; @@ -1842,7 +1842,7 @@ def: InstRW<[ICXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)", "VPEXPANDDZ128rm(b?)", "VPEXPANDQZ128rm(b?)")>; -def ICXWriteResGroup154 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { +def ICXWriteResGroup154 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> { let Latency = 10; let NumMicroOps = 4; let ReleaseAtCycles = [2,1,1]; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index a7dff0ecbcd9..4fded44085e8 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -615,8 +615,8 @@ def : WriteRes<WriteNop, []>; // Horizontal add/sub instructions. 
//////////////////////////////////////////////////////////////////////////////// -defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>; -defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>; +defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort01], 6, [2,1], 3, 6>; +defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort01], 6, [2,1], 3, 7>; defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>; defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>; defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 68155acd9e5b..b3b8486c604b 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -302,6 +302,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { .Case("0x805", "cortex-a76") // Kryo 4xx/5xx Silver .Case("0xc00", "falkor") .Case("0xc01", "saphira") + .Case("0x001", "oryon-1") .Default("generic"); if (Implementer == "0x53") { // Samsung Electronics Co., Ltd. 
// The Exynos chips have a convoluted ID scheme that doesn't seem to follow diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 7464237d26d4..60a784ef002f 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -124,6 +124,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx1103"}, {"gfx1103"}, GK_GFX1103, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1150"}, {"gfx1150"}, GK_GFX1150, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1151"}, {"gfx1151"}, GK_GFX1151, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, + {{"gfx1152"}, {"gfx1152"}, GK_GFX1152, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, @@ -275,6 +276,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX1103: return {11, 0, 3}; case GK_GFX1150: return {11, 5, 0}; case GK_GFX1151: return {11, 5, 1}; + case GK_GFX1152: return {11, 5, 2}; case GK_GFX1200: return {12, 0, 0}; case GK_GFX1201: return {12, 0, 1}; @@ -341,6 +343,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["image-insts"] = true; Features["fp8-conversion-insts"] = true; break; + case GK_GFX1152: case GK_GFX1151: case GK_GFX1150: case GK_GFX1103: @@ -542,6 +545,7 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) { switch (parseArchAMDGCN(GPU)) { case GK_GFX1201: case GK_GFX1200: + case GK_GFX1152: case GK_GFX1151: case GK_GFX1150: case GK_GFX1103: diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 9a5732dca5b7..549d03645f93 100644 --- 
a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -419,7 +419,8 @@ struct AAReturnedFromReturnedValues : public BaseType { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { StateType S(StateType::getBestState(this->getState())); - clampReturnedValueStates<AAType, StateType, IRAttributeKind, RecurseForSelectAndPHI>( + clampReturnedValueStates<AAType, StateType, IRAttributeKind, + RecurseForSelectAndPHI>( A, *this, S, PropagateCallBaseContext ? this->getCallBaseContext() : nullptr); // TODO: If we know we visited all returned values, thus no are assumed @@ -6973,10 +6974,9 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { if (AI.LibraryFunctionId != LibFunc___kmpc_alloc_shared) { Instruction *CtxI = isa<InvokeInst>(AI.CB) ? AI.CB : AI.CB->getNextNode(); if (!Explorer || !Explorer->findInContextOf(UniqueFree, CtxI)) { - LLVM_DEBUG( - dbgs() - << "[H2S] unique free call might not be executed with the allocation " - << *UniqueFree << "\n"); + LLVM_DEBUG(dbgs() << "[H2S] unique free call might not be executed " + "with the allocation " + << *UniqueFree << "\n"); return false; } } @@ -10406,11 +10406,12 @@ struct AANoFPClassFloating : public AANoFPClassImpl { struct AANoFPClassReturned final : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl, - AANoFPClassImpl::StateType, false, Attribute::None, false> { + AANoFPClassImpl::StateType, false, + Attribute::None, false> { AANoFPClassReturned(const IRPosition &IRP, Attributor &A) : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl, - AANoFPClassImpl::StateType, false, Attribute::None, false>( - IRP, A) {} + AANoFPClassImpl::StateType, false, + Attribute::None, false>(IRP, A) {} /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 5fbdbc3a014f..92a9697720ef 100644 
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo DeadArgumentElimination.cpp ElimAvailExtern.cpp EmbedBitcodePass.cpp + ExpandVariadics.cpp ExtractGV.cpp ForceFunctionAttrs.cpp FunctionAttrs.cpp diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp new file mode 100644 index 000000000000..d340bc041ccd --- /dev/null +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -0,0 +1,1012 @@ +//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. 
This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a variadic function are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption( + DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), + cl::init(ExpandVariadicsMode::Unspecified), + cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +// This class may end up instantiated in TargetMachine instances, keeping it +// here for now until enough targets are implemented for the API to evolve. +class VariadicABIInfo { +protected: + VariadicABIInfo() = default; + +public: + static std::unique_ptr<VariadicABIInfo> create(const Triple &T); + + // Allow overriding whether the pass runs on a per-target basis + virtual bool enableForTarget() = 0; + + // Whether a valist instance is passed by value or by address + // I.e. 
does it need to be alloca'ed and stored into, or can + // it be passed directly in a SSA register + virtual bool vaListPassedInSSARegister() = 0; + + // The type of a va_list iterator object + virtual Type *vaListType(LLVMContext &Ctx) = 0; + + // The type of a va_list as a function argument as lowered by C + virtual Type *vaListParameterType(Module &M) = 0; + + // Initialize an allocated va_list object to point to an already + // initialized contiguous memory region. + // Return the value to pass as the va_list argument + virtual Value *initializeVaList(Module &M, LLVMContext &Ctx, + IRBuilder<> &Builder, AllocaInst *VaList, + Value *Buffer) = 0; + + struct VAArgSlotInfo { + Align DataAlign; // With respect to the call frame + bool Indirect; // Passed via a pointer + }; + virtual VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) = 0; + + // Targets implemented so far all have the same trivial lowering for these + bool vaEndIsNop() { return true; } + bool vaCopyIsMemcpy() { return true; } + + virtual ~VariadicABIInfo() = default; +}; + +// Module implements getFunction() which returns nullptr on missing declaration +// and getOrInsertFunction which creates one when absent. Intrinsics.h only +// implements getDeclaration which creates one when missing. Checking whether +// an intrinsic exists thus inserts it in the module and it then needs to be +// deleted again to clean up. +// The right name for the two functions on intrinsics would match Module::, +// but doing that in a single change would introduce nullptr dereferences +// where currently there are none. The minimal collateral damage approach +// would split the change over a release to help downstream branches. As it +// is unclear what approach will be preferred, implementing the trivial +// function here in the meantime to decouple from that discussion. 
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id, + ArrayRef<Type *> Tys = {}) { + auto *FT = Intrinsic::getType(M->getContext(), Id, Tys); + return M->getFunction(Tys.empty() ? Intrinsic::getName(Id) + : Intrinsic::getName(Id, Tys, M, FT)); +} + +class ExpandVariadics : public ModulePass { + + // The pass construction sets the default to optimize when called from middle + // end and lowering when called from the backend. The command line variable + // overrides that. This is useful for testing and debugging. It also allows + // building an applications with variadic functions wholly removed if one + // has sufficient control over the dependencies, e.g. a statically linked + // clang that has no variadic function calls remaining in the binary. + +public: + static char ID; + const ExpandVariadicsMode Mode; + std::unique_ptr<VariadicABIInfo> ABI; + + ExpandVariadics(ExpandVariadicsMode Mode) + : ModulePass(ID), + Mode(commandLineOverride() ? ExpandVariadicsModeOption : Mode) {} + + StringRef getPassName() const override { return "Expand variadic functions"; } + + bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; } + + bool runOnModule(Module &M) override; + + bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F); + + Function *replaceAllUsesWithNewDeclaration(Module &M, + Function *OriginalFunction); + + Function *deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction); + + Function *defineVariadicWrapper(Module &M, IRBuilder<> &Builder, + Function *VariadicWrapper, + Function *FixedArityReplacement); + + bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *, + Function *NF); + + // The intrinsic functions va_copy and va_end are removed unconditionally. + // They correspond to a memcpy and a no-op on all implemented targets. 
+ // The va_start intrinsic is removed from basic blocks that were not created + // by this pass, some may remain if needed to maintain the external ABI. + + template <Intrinsic::ID ID, typename InstructionType> + bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder, + PointerType *IntrinsicArgType) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + if (Function *Intrinsic = + getPreexistingDeclaration(&M, ID, {IntrinsicArgType})) { + for (User *U : make_early_inc_range(Intrinsic->users())) + if (auto *I = dyn_cast<InstructionType>(U)) + Changed |= expandVAIntrinsicCall(Builder, DL, I); + + if (Intrinsic->use_empty()) + Intrinsic->eraseFromParent(); + } + return Changed; + } + + bool expandVAIntrinsicUsersWithAddrspace(Module &M, IRBuilder<> &Builder, + unsigned Addrspace) { + auto &Ctx = M.getContext(); + PointerType *IntrinsicArgType = PointerType::get(Ctx, Addrspace); + bool Changed = false; + + // expand vastart before vacopy as vastart may introduce a vacopy + Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>( + M, Builder, IntrinsicArgType); + Changed |= expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>( + M, Builder, IntrinsicArgType); + Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>( + M, Builder, IntrinsicArgType); + return Changed; + } + + bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL, + VAStartInst *Inst); + + bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &, + VAEndInst *Inst); + + bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL, + VACopyInst *Inst); + + FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) { + // The type of "FTy" with the ... 
removed and a va_list appended + SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end()); + ArgTypes.push_back(ABI->vaListParameterType(M)); + return FunctionType::get(FTy->getReturnType(), ArgTypes, + /*IsVarArgs=*/false); + } + + static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL, + AllocaInst *Alloced) { + std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL); + uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0; + return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt); + } + + bool expansionApplicableToFunction(Module &M, Function *F) { + if (F->isIntrinsic() || !F->isVarArg() || + F->hasFnAttribute(Attribute::Naked)) + return false; + + if (F->getCallingConv() != CallingConv::C) + return false; + + if (rewriteABI()) + return true; + + if (!F->hasExactDefinition()) + return false; + + return true; + } + + bool expansionApplicableToFunctionCall(CallBase *CB) { + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + if (CI->isMustTailCall()) { + // Cannot expand musttail calls + return false; + } + + if (CI->getCallingConv() != CallingConv::C) + return false; + + return true; + } + + if (isa<InvokeInst>(CB)) { + // Invoke not implemented in initial implementation of pass + return false; + } + + // Other unimplemented derivative of CallBase + return false; + } + + class ExpandedCallFrame { + // Helper for constructing an alloca instance containing the arguments bound + // to the variadic ... 
parameter, rearranged to allow indexing through a + // va_list iterator + enum { N = 4 }; + SmallVector<Type *, N> FieldTypes; + enum Tag { Store, Memcpy, Padding }; + SmallVector<std::tuple<Value *, uint64_t, Tag>, N> Source; + + template <Tag tag> void append(Type *FieldType, Value *V, uint64_t Bytes) { + FieldTypes.push_back(FieldType); + Source.push_back({V, Bytes, tag}); + } + + public: + void store(LLVMContext &Ctx, Type *T, Value *V) { append<Store>(T, V, 0); } + + void memcpy(LLVMContext &Ctx, Type *T, Value *V, uint64_t Bytes) { + append<Memcpy>(T, V, Bytes); + } + + void padding(LLVMContext &Ctx, uint64_t By) { + append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr, 0); + } + + size_t size() const { return FieldTypes.size(); } + bool empty() const { return FieldTypes.empty(); } + + StructType *asStruct(LLVMContext &Ctx, StringRef Name) { + const bool IsPacked = true; + return StructType::create(Ctx, FieldTypes, + (Twine(Name) + ".vararg").str(), IsPacked); + } + + void initializeStructAlloca(const DataLayout &DL, IRBuilder<> &Builder, + AllocaInst *Alloced) { + + StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType()); + + for (size_t I = 0; I < size(); I++) { + + auto [V, bytes, tag] = Source[I]; + + if (tag == Padding) { + assert(V == nullptr); + continue; + } + + auto Dst = Builder.CreateStructGEP(VarargsTy, Alloced, I); + + assert(V != nullptr); + + if (tag == Store) + Builder.CreateStore(V, Dst); + + if (tag == Memcpy) + Builder.CreateMemCpy(Dst, {}, V, {}, bytes); + } + } + }; +}; + +bool ExpandVariadics::runOnModule(Module &M) { + bool Changed = false; + if (Mode == ExpandVariadicsMode::Disable) + return Changed; + + Triple TT(M.getTargetTriple()); + ABI = VariadicABIInfo::create(TT); + if (!ABI) + return Changed; + + if (!ABI->enableForTarget()) + return Changed; + + auto &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + IRBuilder<> Builder(Ctx); + + // Lowering needs to run on all functions exactly 
once. + // Optimize could run on functions containing va_start exactly once. + for (Function &F : make_early_inc_range(M)) + Changed |= runOnFunction(M, Builder, &F); + + // After runOnFunction, all known calls to known variadic functions have been + // replaced. va_start intrinsics are presently (and invalidly!) only present + // in functions that used to be variadic and have now been replaced to take a + // va_list instead. If lowering as opposed to optimising, calls to unknown + // variadic functions have also been replaced. + + { + // 0 and AllocaAddrSpace are sufficient for the targets implemented so far + unsigned Addrspace = 0; + Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace); + + Addrspace = DL.getAllocaAddrSpace(); + if (Addrspace != 0) + Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace); + } + + if (Mode != ExpandVariadicsMode::Lowering) + return Changed; + + for (Function &F : make_early_inc_range(M)) { + if (F.isDeclaration()) + continue; + + // Now need to track down indirect calls. Can't find those + // by walking uses of variadic functions, need to crawl the instruction + // stream. Fortunately this is only necessary for the ABI rewrite case. 
+ for (BasicBlock &BB : F) { + for (Instruction &I : make_early_inc_range(BB)) { + if (CallBase *CB = dyn_cast<CallBase>(&I)) { + if (CB->isIndirectCall()) { + FunctionType *FTy = CB->getFunctionType(); + if (FTy->isVarArg()) + // An indirect call has no statically known callee, so no + // replacement function is passed; expandCall then keeps the + // original called operand. + Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr); + } + } + } + } + } + + return Changed; +} + +bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction) { + bool Changed = false; + + if (!expansionApplicableToFunction(M, OriginalFunction)) + return Changed; + + [[maybe_unused]] const bool OriginalFunctionIsDeclaration = + OriginalFunction->isDeclaration(); + assert(rewriteABI() || !OriginalFunctionIsDeclaration); + + // Declare a new function and redirect every use to that new function + Function *VariadicWrapper = + replaceAllUsesWithNewDeclaration(M, OriginalFunction); + assert(VariadicWrapper->isDeclaration()); + assert(OriginalFunction->use_empty()); + + // Create a new function taking va_list containing the implementation of the + // original + Function *FixedArityReplacement = + deriveFixedArityReplacement(M, Builder, OriginalFunction); + assert(OriginalFunction->isDeclaration()); + assert(FixedArityReplacement->isDeclaration() == + OriginalFunctionIsDeclaration); + assert(VariadicWrapper->isDeclaration()); + + // Create a single block forwarding wrapper that turns a ... into a va_list + [[maybe_unused]] Function *VariadicWrapperDefine = + defineVariadicWrapper(M, Builder, VariadicWrapper, FixedArityReplacement); + assert(VariadicWrapperDefine == VariadicWrapper); + assert(!VariadicWrapper->isDeclaration()); + + // We now have: + // 1. the original function, now as a declaration with no uses + // 2. a variadic function that unconditionally calls a fixed arity replacement + // 3. 
a fixed arity function equivalent to the original function + + // Replace known calls to the variadic with calls to the va_list equivalent + for (User *U : make_early_inc_range(VariadicWrapper->users())) { + if (CallBase *CB = dyn_cast<CallBase>(U)) { + Value *calledOperand = CB->getCalledOperand(); + if (VariadicWrapper == calledOperand) + Changed |= + expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(), + FixedArityReplacement); + } + } + + // The original function will be erased. + // One of the two new functions will become a replacement for the original. + // When preserving the ABI, the other is an internal implementation detail. + // When rewriting the ABI, the variadic one is RAUW'd with the fixed arity + // one and then erased. + Function *const ExternallyAccessible = + rewriteABI() ? FixedArityReplacement : VariadicWrapper; + Function *const InternalOnly = + rewriteABI() ? VariadicWrapper : FixedArityReplacement; + + // The external function is the replacement for the original + ExternallyAccessible->setLinkage(OriginalFunction->getLinkage()); + ExternallyAccessible->setVisibility(OriginalFunction->getVisibility()); + ExternallyAccessible->setComdat(OriginalFunction->getComdat()); + ExternallyAccessible->takeName(OriginalFunction); + + // Annotate the internal one as internal + InternalOnly->setVisibility(GlobalValue::DefaultVisibility); + InternalOnly->setLinkage(GlobalValue::InternalLinkage); + + // The original is unused and obsolete + OriginalFunction->eraseFromParent(); + + InternalOnly->removeDeadConstantUsers(); + + if (rewriteABI()) { + // All known calls to the function have been removed by expandCall + // Resolve everything else by replaceAllUsesWith + VariadicWrapper->replaceAllUsesWith(FixedArityReplacement); + VariadicWrapper->eraseFromParent(); + } + + return Changed; +} + +Function * +ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M, + Function *OriginalFunction) { + auto &Ctx = M.getContext(); + Function &F = *OriginalFunction; + FunctionType *FTy = 
F.getFunctionType(); + Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace()); + + NF->setName(F.getName() + ".varargs"); + NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + + AttrBuilder ParamAttrs(Ctx); + AttributeList Attrs = NF->getAttributes(); + Attrs = Attrs.addParamAttributes(Ctx, FTy->getNumParams(), ParamAttrs); + NF->setAttributes(Attrs); + + OriginalFunction->replaceAllUsesWith(NF); + return NF; +} + +Function * +ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction) { + Function &F = *OriginalFunction; + // The purpose here is to split the variadic function F into two functions + // One is a variadic function that bundles the passed argument into a va_list + // and passes it to the second function. The second function does whatever + // the original F does, except that it takes a va_list instead of the ... + + assert(expansionApplicableToFunction(M, &F)); + + auto &Ctx = M.getContext(); + + // Returned value isDeclaration() is equal to F.isDeclaration() + // but that property is not invariant throughout this function + const bool FunctionIsDefinition = !F.isDeclaration(); + + // The replacement's type is F's type with the ... removed and a trailing + // va_list parameter appended + FunctionType *FTy = F.getFunctionType(); + FunctionType *NFTy = inlinableVariadicFunctionType(M, FTy); + Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace()); + + // Note - same attribute handling as DeadArgumentElimination + NF->copyAttributesFrom(&F); + NF->setComdat(F.getComdat()); + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + NF->setName(F.getName() + ".valist"); + NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + + AttrBuilder ParamAttrs(Ctx); + + AttributeList Attrs = NF->getAttributes(); + Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs); + 
NF->setAttributes(Attrs); + + // Splice the implementation into the new function with minimal changes + if (FunctionIsDefinition) { + NF->splice(NF->begin(), &F); + + auto NewArg = NF->arg_begin(); + for (Argument &Arg : F.args()) { + Arg.replaceAllUsesWith(NewArg); + NewArg->setName(Arg.getName()); // takeName without killing the old one + ++NewArg; + } + NewArg->setName("varargs"); + } + + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + F.getAllMetadata(MDs); + for (auto [KindID, Node] : MDs) + NF->addMetadata(KindID, *Node); + F.clearMetadata(); + + return NF; +} + +Function * +ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, + Function *VariadicWrapper, + Function *FixedArityReplacement) { + auto &Ctx = Builder.getContext(); + const DataLayout &DL = M.getDataLayout(); + assert(VariadicWrapper->isDeclaration()); + Function &F = *VariadicWrapper; + + assert(F.isDeclaration()); + Type *VaListTy = ABI->vaListType(Ctx); + + auto *BB = BasicBlock::Create(Ctx, "entry", &F); + Builder.SetInsertPoint(BB); + + AllocaInst *VaListInstance = + Builder.CreateAlloca(VaListTy, nullptr, "va_start"); + + Builder.CreateLifetimeStart(VaListInstance, + sizeOfAlloca(Ctx, DL, VaListInstance)); + + Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)}, + {VaListInstance}); + + SmallVector<Value *> Args; + for (Argument &A : F.args()) + Args.push_back(&A); + + Type *ParameterType = ABI->vaListParameterType(M); + if (ABI->vaListPassedInSSARegister()) + Args.push_back(Builder.CreateLoad(ParameterType, VaListInstance)); + else + Args.push_back(Builder.CreateAddrSpaceCast(VaListInstance, ParameterType)); + + CallInst *Result = Builder.CreateCall(FixedArityReplacement, Args); + + Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)}, + {VaListInstance}); + Builder.CreateLifetimeEnd(VaListInstance, + sizeOfAlloca(Ctx, DL, VaListInstance)); + + if (Result->getType()->isVoidTy()) + Builder.CreateRetVoid(); + else + 
Builder.CreateRet(Result); + + return VariadicWrapper; +} + +bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, + FunctionType *VarargFunctionType, + Function *NF) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + + if (!expansionApplicableToFunctionCall(CB)) { + if (rewriteABI()) + report_fatal_error("Cannot lower callbase instruction"); + return Changed; + } + + // This is tricky. The call instruction's function type might not match + // the type of the caller. When optimising, can leave it unchanged. + // Webassembly detects that inconsistency and repairs it. + FunctionType *FuncType = CB->getFunctionType(); + if (FuncType != VarargFunctionType) { + if (!rewriteABI()) + return Changed; + FuncType = VarargFunctionType; + } + + auto &Ctx = CB->getContext(); + + Align MaxFieldAlign(1); + + // The strategy is to allocate a call frame containing the variadic + // arguments laid out such that a target specific va_list can be initialized + // with it, such that target specific va_arg instructions will correctly + // iterate over it. This means getting the alignment right and sometimes + // embedding a pointer to the value instead of embedding the value itself. + + Function *CBF = CB->getParent()->getParent(); + + ExpandedCallFrame Frame; + + uint64_t CurrentOffset = 0; + + for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) { + Value *ArgVal = CB->getArgOperand(I); + const bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal); + const bool IsByRef = CB->paramHasAttr(I, Attribute::ByRef); + + // The type of the value being passed, decoded from byval/byref metadata if + // required + Type *const UnderlyingType = IsByVal ? CB->getParamByValType(I) + : IsByRef ? 
CB->getParamByRefType(I) + : ArgVal->getType(); + const uint64_t UnderlyingSize = + DL.getTypeAllocSize(UnderlyingType).getFixedValue(); + + // The type to be written into the call frame + Type *FrameFieldType = UnderlyingType; + + // The value to copy from when initialising the frame alloca + Value *SourceValue = ArgVal; + + VariadicABIInfo::VAArgSlotInfo SlotInfo = ABI->slotInfo(DL, UnderlyingType); + + if (SlotInfo.Indirect) { + // The va_arg lowering loads through a pointer. Set up an alloca to aim + // that pointer at. + Builder.SetInsertPointPastAllocas(CBF); + Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + Value *CallerCopy = + Builder.CreateAlloca(UnderlyingType, nullptr, "IndirectAlloca"); + + Builder.SetInsertPoint(CB); + if (IsByVal) + Builder.CreateMemCpy(CallerCopy, {}, ArgVal, {}, UnderlyingSize); + else + Builder.CreateStore(ArgVal, CallerCopy); + + // Indirection now handled, pass the alloca ptr by value + FrameFieldType = DL.getAllocaPtrType(Ctx); + SourceValue = CallerCopy; + } + + // Alignment of the value within the frame + // This probably needs to be controllable as a function of type + Align DataAlign = SlotInfo.DataAlign; + + MaxFieldAlign = std::max(MaxFieldAlign, DataAlign); + + uint64_t DataAlignV = DataAlign.value(); + if (uint64_t Rem = CurrentOffset % DataAlignV) { + // Inject explicit padding to deal with alignment requirements + uint64_t Padding = DataAlignV - Rem; + Frame.padding(Ctx, Padding); + CurrentOffset += Padding; + } + + if (SlotInfo.Indirect) { + Frame.store(Ctx, FrameFieldType, SourceValue); + } else { + if (IsByVal) + Frame.memcpy(Ctx, FrameFieldType, SourceValue, UnderlyingSize); + else + Frame.store(Ctx, FrameFieldType, SourceValue); + } + + CurrentOffset += DL.getTypeAllocSize(FrameFieldType).getFixedValue(); + } + + if (Frame.empty()) { + // Not passing any arguments, hopefully va_arg won't try to read any + // Creating a single byte frame containing nothing to point the va_list + // instance as that is 
less special-casey in the compiler and probably + // easier to interpret in a debugger. + Frame.padding(Ctx, 1); + } + + StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName()); + + // The struct instance needs to be at least MaxFieldAlign for the alignment of + // the fields to be correct at runtime. Use the native stack alignment instead + // if that's greater as that tends to give better codegen. + // This is an awkward way to guess whether there is a known stack alignment + // without hitting an assert in DL.getStackAlignment, 1024 is an arbitrary + // number likely to be greater than the natural stack alignment. + // TODO: DL.getStackAlignment could return a MaybeAlign instead of assert + Align AllocaAlign = MaxFieldAlign; + if (DL.exceedsNaturalStackAlignment(Align(1024))) + AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment()); + + // Put the alloca to hold the variadic args in the entry basic block. + Builder.SetInsertPointPastAllocas(CBF); + + // SetCurrentDebugLocation when the builder SetInsertPoint method does not + Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + + // The awkward construction here is to set the alignment on the instance + AllocaInst *Alloced = Builder.Insert( + new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign), + "vararg_buffer"); + Changed = true; + assert(Alloced->getAllocatedType() == VarargsTy); + + // Initialize the fields in the struct + Builder.SetInsertPoint(CB); + Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Frame.initializeStructAlloca(DL, Builder, Alloced); + + const unsigned NumArgs = FuncType->getNumParams(); + SmallVector<Value *> Args(CB->arg_begin(), CB->arg_begin() + NumArgs); + + // Initialize a va_list pointing to that struct and pass it as the last + // argument + AllocaInst *VaList = nullptr; + { + if (!ABI->vaListPassedInSSARegister()) { + Type *VaListTy = ABI->vaListType(Ctx); + Builder.SetInsertPointPastAllocas(CBF); + 
Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument"); + Builder.SetInsertPoint(CB); + Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList)); + } + Builder.SetInsertPoint(CB); + Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced)); + } + + // Attributes excluding any on the vararg arguments + AttributeList PAL = CB->getAttributes(); + if (!PAL.isEmpty()) { + SmallVector<AttributeSet, 8> ArgAttrs; + for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++) + ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); + PAL = + AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs); + } + + SmallVector<OperandBundleDef, 1> OpBundles; + CB->getOperandBundlesAsDefs(OpBundles); + + CallBase *NewCB = nullptr; + + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + Value *Dst = NF ? NF : CI->getCalledOperand(); + FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType); + + NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI); + + CallInst::TailCallKind TCK = CI->getTailCallKind(); + assert(TCK != CallInst::TCK_MustTail); + + // Can't tail call a function that is being passed a pointer to an alloca + if (TCK == CallInst::TCK_Tail) + TCK = CallInst::TCK_None; + CI->setTailCallKind(TCK); + + } else { + llvm_unreachable("Unreachable when !expansionApplicableToFunctionCall()"); + } + + if (VaList) + Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList)); + + Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + + NewCB->setAttributes(PAL); + NewCB->takeName(CB); + NewCB->setCallingConv(CB->getCallingConv()); + NewCB->setDebugLoc(DebugLoc()); + + // DeadArgElim and ArgPromotion copy exactly this metadata + NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg}); + + CB->replaceAllUsesWith(NewCB); + CB->eraseFromParent(); + return Changed; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder, + const 
DataLayout &DL, + VAStartInst *Inst) { + // Only removing va_start instructions that are not in variadic functions. + // Those would be rejected by the IR verifier before this pass. + // After splicing basic blocks from a variadic function into a fixed arity + // one the va_start that used to refer to the ... parameter still exist. + // There are also variadic functions that this pass did not change and + // va_start instances in the created single block wrapper functions. + // Replace exactly the instances in non-variadic functions as those are + // the ones to be fixed up to use the va_list passed as the final argument. + + Function *ContainingFunction = Inst->getFunction(); + if (ContainingFunction->isVarArg()) { + return false; + } + + // The last argument is a vaListParameterType, either a va_list + // or a pointer to one depending on the target. + bool PassedByValue = ABI->vaListPassedInSSARegister(); + Argument *PassedVaList = + ContainingFunction->getArg(ContainingFunction->arg_size() - 1); + + // va_start takes a pointer to a va_list, e.g. one on the stack + Value *VaStartArg = Inst->getArgList(); + + Builder.SetInsertPoint(Inst); + + if (PassedByValue) { + // The general thing to do is create an alloca, store the va_list argument + // to it, then create a va_copy. When vaCopyIsMemcpy(), this optimises to a + // store to the VaStartArg. 
+ assert(ABI->vaCopyIsMemcpy()); + Builder.CreateStore(PassedVaList, VaStartArg); + } else { + + // Otherwise emit a vacopy to pick up target-specific handling if any + auto &Ctx = Builder.getContext(); + + Builder.CreateIntrinsic(Intrinsic::vacopy, {DL.getAllocaPtrType(Ctx)}, + {VaStartArg, PassedVaList}); + } + + Inst->eraseFromParent(); + return true; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &, + VAEndInst *Inst) { + assert(ABI->vaEndIsNop()); + Inst->eraseFromParent(); + return true; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder, + const DataLayout &DL, + VACopyInst *Inst) { + assert(ABI->vaCopyIsMemcpy()); + Builder.SetInsertPoint(Inst); + + auto &Ctx = Builder.getContext(); + Type *VaListTy = ABI->vaListType(Ctx); + uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue(); + + Builder.CreateMemCpy(Inst->getDest(), {}, Inst->getSrc(), {}, + Builder.getInt32(Size)); + + Inst->eraseFromParent(); + return true; +} + +struct Amdgpu final : public VariadicABIInfo { + + bool enableForTarget() override { return true; } + + bool vaListPassedInSSARegister() override { return true; } + + Type *vaListType(LLVMContext &Ctx) override { + return PointerType::getUnqual(Ctx); + } + + Type *vaListParameterType(Module &M) override { + return PointerType::getUnqual(M.getContext()); + } + + Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder, + AllocaInst * /*va_list*/, Value *Buffer) override { + // Given Buffer, which is an AllocInst of vararg_buffer + // need to return something usable as parameter type + return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M)); + } + + VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override { + return {Align(4), false}; + } +}; + +struct Wasm final : public VariadicABIInfo { + + bool enableForTarget() override { + // Currently wasm is only used for testing. 
+ return commandLineOverride(); + } + + bool vaListPassedInSSARegister() override { return true; } + + Type *vaListType(LLVMContext &Ctx) override { + return PointerType::getUnqual(Ctx); + } + + Type *vaListParameterType(Module &M) override { + return PointerType::getUnqual(M.getContext()); + } + + Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder, + AllocaInst * /*va_list*/, Value *Buffer) override { + return Buffer; + } + + VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override { + LLVMContext &Ctx = Parameter->getContext(); + const unsigned MinAlign = 4; + Align A = DL.getABITypeAlign(Parameter); + if (A < MinAlign) + A = Align(MinAlign); + + if (auto s = dyn_cast<StructType>(Parameter)) { + if (s->getNumElements() > 1) { + return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true}; + } + } + + return {A, false}; + } +}; + +std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) { + switch (T.getArch()) { + case Triple::r600: + case Triple::amdgcn: { + return std::make_unique<Amdgpu>(); + } + + case Triple::wasm32: { + return std::make_unique<Wasm>(); + } + + default: + return {}; + } +} + +} // namespace + +char ExpandVariadics::ID = 0; + +INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false, + false) + +ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode M) { + return new ExpandVariadics(M); +} + +PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) { + return ExpandVariadics(Mode).runOnModule(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} + +ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode M) : Mode(M) {} diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 03923b83cf34..f033d2b0d6d0 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -262,8 +262,70 @@ public: // TODO: Should this be a map (from Caller node) for more efficient lookup? std::vector<std::shared_ptr<ContextEdge>> CallerEdges; - // The set of IDs for contexts including this node. - DenseSet<uint32_t> ContextIds; + // Get the list of edges from which we can compute allocation information + // such as the context ids and allocation type of this node. + const std::vector<std::shared_ptr<ContextEdge>> * + getEdgesWithAllocInfo() const { + // If node has any callees, compute from those, otherwise compute from + // callers (i.e. if this is the leaf allocation node). + if (!CalleeEdges.empty()) + return &CalleeEdges; + if (!CallerEdges.empty()) { + // A node with caller edges but no callee edges must be the allocation + // node. + assert(IsAllocation); + return &CallerEdges; + } + return nullptr; + } + + // Compute the context ids for this node from the union of its edge context + // ids. + DenseSet<uint32_t> getContextIds() const { + DenseSet<uint32_t> ContextIds; + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return {}; + unsigned Count = 0; + for (auto &Edge : *Edges) + Count += Edge->getContextIds().size(); + ContextIds.reserve(Count); + for (auto &Edge : *Edges) + ContextIds.insert(Edge->getContextIds().begin(), + Edge->getContextIds().end()); + return ContextIds; + } + + // Compute the allocation type for this node from the OR of its edge + // allocation types. 
+ uint8_t computeAllocType() const { + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return (uint8_t)AllocationType::None; + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + uint8_t AllocType = (uint8_t)AllocationType::None; + for (auto &Edge : *Edges) { + AllocType |= Edge->AllocTypes; + // Bail early if alloc type reached both, no further refinement. + if (AllocType == BothTypes) + return AllocType; + } + return AllocType; + } + + // The context ids set for this node is empty if its edge context ids are + // also all empty. + bool emptyContextIds() const { + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return true; + for (auto &Edge : *Edges) { + if (!Edge->getContextIds().empty()) + return false; + } + return true; + } // List of clones of this ContextNode, initially empty. std::vector<ContextNode *> Clones; @@ -308,15 +370,11 @@ public: void printCall(raw_ostream &OS) const { Call.print(OS); } // True if this node was effectively removed from the graph, in which case - // its context id set, caller edges, and callee edges should all be empty. + // it should have an allocation type of None and empty context ids. bool isRemoved() const { - // Note that we can have non-empty context ids with empty caller and - // callee edges if the graph ends up with a single node. - if (ContextIds.empty()) - assert(CalleeEdges.empty() && CallerEdges.empty() && - "Context ids empty but at least one of callee and caller edges " - "were not!"); - return ContextIds.empty(); + assert((AllocTypes == (uint8_t)AllocationType::None) == + emptyContextIds()); + return AllocTypes == (uint8_t)AllocationType::None; } void dump() const; @@ -429,7 +487,8 @@ private: /// else to its callers. Also updates OrigNode's edges to remove any context /// ids moved to the newly created edge. 
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, - bool TowardsCallee); + bool TowardsCallee, + DenseSet<uint32_t> RemainingContextIds); /// Get the stack id corresponding to the given Id or Index (for IR this will /// return itself, for a summary index this will return the id recorded in the @@ -958,7 +1017,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( // Update alloc type and context ids for this MIB. AllocNode->AllocTypes |= (uint8_t)AllocType; - AllocNode->ContextIds.insert(LastContextId); // Now add or update nodes for each stack id in alloc's context. // Later when processing the stack ids on non-alloc callsites we will adjust @@ -983,7 +1041,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( auto Ins = StackIdSet.insert(StackId); if (!Ins.second) StackNode->Recursive = true; - StackNode->ContextIds.insert(LastContextId); StackNode->AllocTypes |= (uint8_t)AllocType; PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); PrevNode = StackNode; @@ -1034,7 +1091,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // it resulted in any added ids to NextNode. if (!NewIdsToAdd.empty()) { Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); - NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); UpdateCallers(NextNode, Visited, UpdateCallers); } } @@ -1043,21 +1099,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: DenseSet<const ContextEdge *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) { auto *Node = Entry.second; - // Update ids on the allocation nodes before calling the recursive - // update along caller edges, since this simplifies the logic during - // that traversal. 
- DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds); - Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); UpdateCallers(Node, Visited, UpdateCallers); } } template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( - ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { - // Make a copy of the context ids, since this will be adjusted below as they - // are moved. - DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds; + ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, + // This must be passed by value to make a copy since it will be adjusted + // as ids are moved. + DenseSet<uint32_t> RemainingContextIds) { auto &OrigEdges = TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; // Increment iterator in loop so that we can remove edges as needed. @@ -1104,6 +1155,51 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( } template <typename DerivedCCG, typename FuncTy, typename CallTy> +static void checkEdge( + const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { + // Confirm that alloc type is not None and that we have at least one context + // id. + assert(Edge->AllocTypes != (uint8_t)AllocationType::None); + assert(!Edge->ContextIds.empty()); +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> +static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, + bool CheckEdges = true) { + if (Node->isRemoved()) + return; +#ifndef NDEBUG + // Compute node's context ids once for use in asserts. + auto NodeContextIds = Node->getContextIds(); +#endif + // Node's context ids should be the union of both its callee and caller edge + // context ids. 
+ if (Node->CallerEdges.size()) { + DenseSet<uint32_t> CallerEdgeContextIds( + Node->CallerEdges.front()->ContextIds); + for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { + if (CheckEdges) + checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); + set_union(CallerEdgeContextIds, Edge->ContextIds); + } + // Node can have more context ids than callers if some contexts terminate at + // node and some are longer. + assert(NodeContextIds == CallerEdgeContextIds || + set_is_subset(CallerEdgeContextIds, NodeContextIds)); + } + if (Node->CalleeEdges.size()) { + DenseSet<uint32_t> CalleeEdgeContextIds( + Node->CalleeEdges.front()->ContextIds); + for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { + if (CheckEdges) + checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); + set_union(CalleeEdgeContextIds, Edge->getContextIds()); + } + assert(NodeContextIds == CalleeEdgeContextIds); + } +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: assignStackNodesPostOrder(ContextNode *Node, DenseSet<const ContextNode *> &Visited, @@ -1178,7 +1274,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // duplicated context ids. We have to recompute as we might have overlap // overlap between the saved context ids for different last nodes, and // removed them already during the post order traversal. 
- set_intersect(SavedContextIds, FirstNode->ContextIds); + set_intersect(SavedContextIds, FirstNode->getContextIds()); ContextNode *PrevNode = nullptr; for (auto Id : Ids) { ContextNode *CurNode = getNodeForStackId(Id); @@ -1211,18 +1307,17 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: ContextNode *NewNode = NodeOwner.back().get(); NodeToCallingFunc[NewNode] = Func; NonAllocationCallToContextNodeMap[Call] = NewNode; - NewNode->ContextIds = SavedContextIds; - NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); + NewNode->AllocTypes = computeAllocType(SavedContextIds); // Connect to callees of innermost stack frame in inlined call chain. // This updates context ids for FirstNode's callee's to reflect those // moved to NewNode. - connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true); + connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds); // Connect to callers of outermost stack frame in inlined call chain. // This updates context ids for FirstNode's caller's to reflect those // moved to NewNode. - connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false); + connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds); // Now we need to remove context ids from edges/nodes between First and // Last Node. @@ -1234,18 +1329,32 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // Remove the context ids moved to NewNode from CurNode, and the // edge from the prior node. - set_subtract(CurNode->ContextIds, NewNode->ContextIds); if (PrevNode) { auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); assert(PrevEdge); - set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); + set_subtract(PrevEdge->getContextIds(), SavedContextIds); if (PrevEdge->getContextIds().empty()) { PrevNode->eraseCallerEdge(PrevEdge); CurNode->eraseCalleeEdge(PrevEdge); } } + // Since we update the edges from leaf to tail, only look at the callee + // edges. 
This isn't an alloc node, so if there are no callee edges, the + // alloc type is None. + CurNode->AllocTypes = CurNode->CalleeEdges.empty() + ? (uint8_t)AllocationType::None + : CurNode->computeAllocType(); PrevNode = CurNode; } + if (VerifyNodes) { + checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); + for (auto Id : Ids) { + ContextNode *CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); + } + } } } @@ -1319,7 +1428,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // Initialize the context ids with the last node's. We will subsequently // refine the context ids by computing the intersection along all edges. - DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; + DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); assert(!LastNodeContextIds.empty()); for (unsigned I = 0; I < Calls.size(); I++) { @@ -1442,6 +1551,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { DenseSet<const ContextNode *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); + if (VerifyCCG) + check(); } uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { @@ -1786,8 +1897,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( // First check if we have already synthesized a node for this tail call. 
if (TailCallToContextNodeMap.count(NewCall)) { NewNode = TailCallToContextNodeMap[NewCall]; - NewNode->ContextIds.insert(Edge->ContextIds.begin(), - Edge->ContextIds.end()); NewNode->AllocTypes |= Edge->AllocTypes; } else { FuncToCallsWithMetadata[Func].push_back({NewCall}); @@ -1797,7 +1906,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( NewNode = NodeOwner.back().get(); NodeToCallingFunc[NewNode] = Func; TailCallToContextNodeMap[NewCall] = NewNode; - NewNode->ContextIds = Edge->ContextIds; NewNode->AllocTypes = Edge->AllocTypes; } @@ -2091,6 +2199,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( OS << "\n"; OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; OS << "\tContextIds:"; + // Make a copy of the computed context ids that we can sort for stability. + auto ContextIds = getContextIds(); std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); std::sort(SortedIds.begin(), SortedIds.end()); for (auto Id : SortedIds) @@ -2151,53 +2261,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( } template <typename DerivedCCG, typename FuncTy, typename CallTy> -static void checkEdge( - const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { - // Confirm that alloc type is not None and that we have at least one context - // id. - assert(Edge->AllocTypes != (uint8_t)AllocationType::None); - assert(!Edge->ContextIds.empty()); -} - -template <typename DerivedCCG, typename FuncTy, typename CallTy> -static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, - bool CheckEdges = true) { - if (Node->isRemoved()) - return; - // Node's context ids should be the union of both its callee and caller edge - // context ids. 
- if (Node->CallerEdges.size()) { - auto EI = Node->CallerEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CallerEdges.end(); EI++) { - const auto &Edge = *EI; - if (CheckEdges) - checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); - set_union(CallerEdgeContextIds, Edge->ContextIds); - } - // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(Node->ContextIds == CallerEdgeContextIds || - set_is_subset(CallerEdgeContextIds, Node->ContextIds)); - } - if (Node->CalleeEdges.size()) { - auto EI = Node->CalleeEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CalleeEdges.end(); EI++) { - const auto &Edge = *EI; - if (CheckEdges) - checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); - set_union(CalleeEdgeContextIds, Edge->ContextIds); - } - assert(Node->ContextIds == CalleeEdgeContextIds); - } -} - -template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; for (const auto Node : nodes<GraphType>(this)) { @@ -2284,7 +2347,7 @@ struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> static std::string getNodeAttributes(NodeRef Node, GraphType) { std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + - getContextIds(Node->ContextIds) + "\"") + getContextIds(Node->getContextIds()) + "\"") .str(); AttributeString += (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); @@ -2443,16 +2506,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: set_subtract(Edge->ContextIds, ContextIdsToMove); Edge->AllocTypes = computeAllocType(Edge->ContextIds); } - // Now perform some updates that are common to all cases: the NewCallee gets - // the moved ids 
added, and we need to remove those ids from OldCallee and - // update its alloc type (NewCallee alloc type updates handled above). - NewCallee->ContextIds.insert(ContextIdsToMove.begin(), - ContextIdsToMove.end()); - set_subtract(OldCallee->ContextIds, ContextIdsToMove); - OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); - // OldCallee alloc type should be None iff its context id set is now empty. - assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == - OldCallee->ContextIds.empty()); // Now walk the old callee node's callee edges and move Edge's context ids // over to the corresponding edge into the clone (which is created here if // this is a newly created clone). @@ -2484,6 +2537,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: NewCallee->CalleeEdges.push_back(NewEdge); NewEdge->Callee->CallerEdges.push_back(NewEdge); } + // Recompute the node alloc type now that its callee edges have been + // updated (since we will compute from those edges). + OldCallee->AllocTypes = OldCallee->computeAllocType(); + // OldCallee alloc type should be None iff its context id set is now empty. + assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == + OldCallee->emptyContextIds()); if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); @@ -2528,7 +2587,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { DenseSet<const ContextNode *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) { Visited.clear(); - identifyClones(Entry.second, Visited, Entry.second->ContextIds); + identifyClones(Entry.second, Visited, Entry.second->getContextIds()); } Visited.clear(); for (auto &Entry : AllocationCallToContextNodeMap) @@ -2714,7 +2773,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( } // We should still have some context ids on the original Node. 
- assert(!Node->ContextIds.empty()); + assert(!Node->emptyContextIds()); // Sanity check that no alloc types on node or edges are None. assert(Node->AllocTypes != (uint8_t)AllocationType::None); @@ -2918,7 +2977,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // find additional cloning is required. std::deque<ContextNode *> ClonesWorklist; // Ignore original Node if we moved all of its contexts to clones. - if (!Node->ContextIds.empty()) + if (!Node->emptyContextIds()) ClonesWorklist.push_back(Node); ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), Node->Clones.end()); @@ -3258,7 +3317,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Skip if either no call to update, or if we ended up with no context ids // (we moved all edges onto other clones). - if (!Node->hasCall() || Node->ContextIds.empty()) + if (!Node->hasCall() || Node->emptyContextIds()) return; if (Node->IsAllocation) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 89193f8ff94b..38c1c2644554 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4745,6 +4745,29 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q, Constant::getNullValue(Op1->getType())); } + if (!ICmpInst::isSigned(Pred)) + return nullptr; + + KnownBits KnownY = IC.computeKnownBits(A, /*Depth=*/0, &I); + // (X & NegY) spred X --> (X & NegY) upred X + if (KnownY.isNegative()) + return new ICmpInst(ICmpInst::getUnsignedPredicate(Pred), Op0, Op1); + + if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGT) + return nullptr; + + if (KnownY.isNonNegative()) + // (X & PosY) s<= X --> X s>= 0 + // (X & PosY) s> X --> X s< 0 + return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1, + Constant::getNullValue(Op1->getType())); + + if (isKnownNegative(Op1, 
IC.getSimplifyQuery().getWithInstruction(&I))) + // (NegX & Y) s<= NegX --> Y s< 0 + // (NegX & Y) s> NegX --> Y s>= 0 + return new ICmpInst(ICmpInst::getFlippedStrictnessPredicate(Pred), A, + Constant::getNullValue(A->getType())); + return nullptr; } @@ -4772,7 +4795,7 @@ static Instruction *foldICmpOrXX(ICmpInst &I, const SimplifyQuery &Q, if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) { // icmp (X | Y) eq/ne Y --> (X & ~Y) eq/ne 0 if Y is freely invertible if (Value *NotOp1 = - IC.getFreelyInverted(Op1, Op1->hasOneUse(), &IC.Builder)) + IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder)) return new ICmpInst(Pred, IC.Builder.CreateAnd(A, NotOp1), Constant::getNullValue(Op1->getType())); // icmp (X | Y) eq/ne Y --> (~X | Y) eq/ne -1 if X is freely invertible. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 313beb7b6407..d2aaa5e23054 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1294,8 +1294,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, // X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite // replacement cycle. 
Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); - if (TrueVal != CmpLHS && - isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) { + if (TrueVal != CmpLHS && isGuaranteedNotToBeUndef(CmpRHS, SQ.AC, &Sel, &DT)) { if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ true)) // Require either the replacement or the simplification result to be a @@ -1316,8 +1315,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, if (replaceInInstruction(TrueVal, CmpLHS, CmpRHS)) return &Sel; } - if (TrueVal != CmpRHS && - isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT)) + if (TrueVal != CmpRHS && isGuaranteedNotToBeUndef(CmpLHS, SQ.AC, &Sel, &DT)) if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ true)) if (isa<Constant>(CmpLHS) || isa<Constant>(V)) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2aa21759d56e..a0e63bf12400 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -337,13 +337,17 @@ private: unsigned AccessSizeIndex, Instruction *InsertBefore, DomTreeUpdater &DTU, LoopInfo *LI); - bool ignoreMemIntrinsic(MemIntrinsic *MI); + bool ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, MemIntrinsic *MI); void instrumentMemIntrinsic(MemIntrinsic *MI); bool instrumentMemAccess(InterestingMemoryOperand &O, DomTreeUpdater &DTU, LoopInfo *LI); - bool ignoreAccess(Instruction *Inst, Value *Ptr); + bool ignoreAccessWithoutRemark(Instruction *Inst, Value *Ptr); + bool ignoreAccess(OptimizationRemarkEmitter &ORE, Instruction *Inst, + Value *Ptr); + void getInterestingMemoryOperands( - Instruction *I, const TargetLibraryInfo &TLI, + OptimizationRemarkEmitter &ORE, Instruction *I, + const TargetLibraryInfo &TLI, SmallVectorImpl<InterestingMemoryOperand> &Interesting); void 
tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); @@ -765,7 +769,8 @@ Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) { return IRB.CreateLoad(PtrTy, GlobalDynamicAddress); } -bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { +bool HWAddressSanitizer::ignoreAccessWithoutRemark(Instruction *Inst, + Value *Ptr) { // Do not instrument accesses from different address spaces; we cannot deal // with them. Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType()); @@ -795,8 +800,23 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { return false; } +bool HWAddressSanitizer::ignoreAccess(OptimizationRemarkEmitter &ORE, + Instruction *Inst, Value *Ptr) { + bool Ignored = ignoreAccessWithoutRemark(Inst, Ptr); + if (Ignored) { + ORE.emit( + [&]() { return OptimizationRemark(DEBUG_TYPE, "ignoreAccess", Inst); }); + } else { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ignoreAccess", Inst); + }); + } + return Ignored; +} + void HWAddressSanitizer::getInterestingMemoryOperands( - Instruction *I, const TargetLibraryInfo &TLI, + OptimizationRemarkEmitter &ORE, Instruction *I, + const TargetLibraryInfo &TLI, SmallVectorImpl<InterestingMemoryOperand> &Interesting) { // Skip memory accesses inserted by another instrumentation. 
if (I->hasMetadata(LLVMContext::MD_nosanitize)) @@ -807,22 +827,22 @@ void HWAddressSanitizer::getInterestingMemoryOperands( return; if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (!ClInstrumentReads || ignoreAccess(I, LI->getPointerOperand())) + if (!ClInstrumentReads || ignoreAccess(ORE, I, LI->getPointerOperand())) return; Interesting.emplace_back(I, LI->getPointerOperandIndex(), false, LI->getType(), LI->getAlign()); } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (!ClInstrumentWrites || ignoreAccess(I, SI->getPointerOperand())) + if (!ClInstrumentWrites || ignoreAccess(ORE, I, SI->getPointerOperand())) return; Interesting.emplace_back(I, SI->getPointerOperandIndex(), true, SI->getValueOperand()->getType(), SI->getAlign()); } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { - if (!ClInstrumentAtomics || ignoreAccess(I, RMW->getPointerOperand())) + if (!ClInstrumentAtomics || ignoreAccess(ORE, I, RMW->getPointerOperand())) return; Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true, RMW->getValOperand()->getType(), std::nullopt); } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { - if (!ClInstrumentAtomics || ignoreAccess(I, XCHG->getPointerOperand())) + if (!ClInstrumentAtomics || ignoreAccess(ORE, I, XCHG->getPointerOperand())) return; Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true, XCHG->getCompareOperand()->getType(), @@ -830,7 +850,7 @@ void HWAddressSanitizer::getInterestingMemoryOperands( } else if (auto *CI = dyn_cast<CallInst>(I)) { for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) { if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) || - ignoreAccess(I, CI->getArgOperand(ArgNo))) + ignoreAccess(ORE, I, CI->getArgOperand(ArgNo))) continue; Type *Ty = CI->getParamByValType(ArgNo); Interesting.emplace_back(I, ArgNo, false, Ty, Align(1)); @@ -1035,13 +1055,14 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, ->setSuccessor(0, 
TCI.TagMismatchTerm->getParent()); } -bool HWAddressSanitizer::ignoreMemIntrinsic(MemIntrinsic *MI) { +bool HWAddressSanitizer::ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, + MemIntrinsic *MI) { if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { - return (!ClInstrumentWrites || ignoreAccess(MTI, MTI->getDest())) && - (!ClInstrumentReads || ignoreAccess(MTI, MTI->getSource())); + return (!ClInstrumentWrites || ignoreAccess(ORE, MTI, MTI->getDest())) && + (!ClInstrumentReads || ignoreAccess(ORE, MTI, MTI->getSource())); } if (isa<MemSetInst>(MI)) - return !ClInstrumentWrites || ignoreAccess(MI, MI->getDest()); + return !ClInstrumentWrites || ignoreAccess(ORE, MI, MI->getDest()); return false; } @@ -1541,6 +1562,9 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, NumTotalFuncs++; + OptimizationRemarkEmitter &ORE = + FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + if (selectiveInstrumentationShouldSkip(F, FAM)) return; @@ -1562,10 +1586,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) LandingPadVec.push_back(&Inst); - getInterestingMemoryOperands(&Inst, TLI, OperandsToInstrument); + getInterestingMemoryOperands(ORE, &Inst, TLI, OperandsToInstrument); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) - if (!ignoreMemIntrinsic(MI)) + if (!ignoreMemIntrinsic(ORE, MI)) IntrinToInstrument.push_back(MI); } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index ba2546b8db0e..4371b821eae6 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -827,7 +827,8 @@ private: return false; } - if (Metrics.convergent) { + // FIXME: Allow jump threading with controlled convergence. 
+ if (Metrics.Convergence != ConvergenceKind::None) { LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " << "convergent instructions.\n"); ORE->emit([&]() { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 7b4c54370e48..f8e2f1f28088 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns); if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable\n"); return LoopUnrollResult::Unmodified; } @@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } - if (InnerUCE.Convergent || OuterUCE.Convergent) { + // FIXME: The call to canUnroll() allows some controlled convergent + // operations, but we block them here for future changes. 
+ if (InnerUCE.Convergence != ConvergenceKind::None || + OuterUCE.Convergence != ConvergenceKind::None) { LLVM_DEBUG( dbgs() << " Not unrolling loop with convergent instructions.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 10fc9e9303e8..cbc35b6dd429 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator( const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) { CodeMetrics Metrics; for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); + Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false, + L); NumInlineCandidates = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; - Convergent = Metrics.convergent; + Convergence = Metrics.Convergence; LoopSize = Metrics.NumInsts; + ConvergenceAllowsRuntime = + Metrics.Convergence != ConvergenceKind::Uncontrolled && + !getLoopConvergenceHeart(L); // Don't allow an estimate of size zero. 
This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's @@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator( LoopSize = BEInsns + 1; } +bool UnrollCostEstimator::canUnroll() const { + switch (Convergence) { + case ConvergenceKind::ExtendedLoop: + LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n"); + return false; + default: + break; + } + if (!LoopSize.isValid()) { + LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n"); + return false; + } + if (NotDuplicatable) { + LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n"); + return false; + } + return true; +} + uint64_t UnrollCostEstimator::getUnrolledLoopSize( const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite) const { @@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); if (!UCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n"); return LoopUnrollResult::Unmodified; } @@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // is unsafe -- it adds a control-flow dependency to the convergent // operation. Therefore restrict remainder loop (try unrolling without). // - // TODO: This is quite conservative. In practice, convergent_op() - // is likely to be called unconditionally in the loop. In this - // case, the program would be ill-formed (on most architectures) - // unless n were the same on all threads in a thread group. - // Assuming n is the same on all threads, any kind of unrolling is - // safe. But currently llvm's notion of convergence isn't powerful - // enough to express this. 
- if (UCE.Convergent) - UP.AllowRemainder = false; + // TODO: This is somewhat conservative; we could allow the remainder if the + // trip count is uniform. + UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime; // Try to find the trip count upper bound if we cannot find the exact trip // count. @@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, if (!UP.Count) return LoopUnrollResult::Unmodified; + UP.Runtime &= UCE.ConvergenceAllowsRuntime; + if (PP.PeelCount) { assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName() @@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // Unroll the loop. Loop *RemainderLoop = nullptr; + UnrollLoopOptions ULO; + ULO.Count = UP.Count; + ULO.Force = UP.Force; + ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount; + ULO.UnrollRemainder = UP.UnrollRemainder; + ULO.Runtime = UP.Runtime; + ULO.ForgetAllSCEV = ForgetAllSCEV; + ULO.Heart = getLoopConvergenceHeart(L); LoopUnrollResult UnrollResult = UnrollLoop( - L, - {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UP.UnrollRemainder, ForgetAllSCEV}, - LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); + L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index eb471b259c7d..cfe63496a100 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1221,7 +1221,6 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, SmallPtrSet<const Value *, 4> ObjSet; SmallVector<Metadata *, 4> Scopes, NoAliases; - SmallSetVector<const Argument *, 4> NAPtrArgs; for (const Value *V : PtrArgs) { 
SmallVector<const Value *, 4> Objects; getUnderlyingObjects(V, Objects, /* LI = */ nullptr); diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 08ba65d9483e..3d950b151cd3 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { L->dump()); return Rotated; } - if (Metrics.convergent) { + if (Metrics.Convergence != ConvergenceKind::None) { LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " "instructions: "; L->dump()); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1216538195fb..90d7b99e9d81 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, } } +// Loops containing convergent instructions that are uncontrolled or controlled +// from outside the loop must have a count that divides their TripMultiple. +LLVM_ATTRIBUTE_USED +static bool canHaveUnrollRemainder(const Loop *L) { + if (getLoopConvergenceHeart(L)) + return false; + + // Check for uncontrolled convergent operations. + for (auto &BB : L->blocks()) { + for (auto &I : *BB) { + if (isa<ConvergenceControlInst>(I)) + return true; + if (auto *CB = dyn_cast<CallBase>(&I)) + if (CB->isConvergent()) + return CB->getConvergenceControlToken(); + } + } + return true; +} + /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. 
However, if the trip count (and multiple) are not known, @@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, return LoopUnrollResult::Unmodified; } - // Loops containing convergent instructions cannot use runtime unrolling, - // as the prologue/epilogue may add additional control-dependencies to - // convergent operations. - LLVM_DEBUG( - { - bool HasConvergent = false; - for (auto &BB : L->blocks()) - for (auto &I : *BB) - if (auto *CB = dyn_cast<CallBase>(&I)) - HasConvergent |= CB->isConvergent(); - assert((!HasConvergent || !ULO.Runtime) && - "Can't runtime unroll if loop contains a convergent operation."); - }); + assert((!ULO.Runtime || canHaveUnrollRemainder(L)) && + "Can't runtime unroll if loop contains a convergent operation."); bool EpilogProfitability = UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog @@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (OldLoop) LoopsToSimplify.insert(NewLoops[OldLoop]); - if (*BB == Header) + if (*BB == Header) { // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. for (PHINode *OrigPHI : OrigPHINode) { @@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, NewPHI->eraseFromParent(); } + // Eliminate copies of the loop heart intrinsic, if any. 
+ if (ULO.Heart) { + auto it = VMap.find(ULO.Heart); + assert(it != VMap.end()); + Instruction *heartCopy = cast<Instruction>(it->second); + heartCopy->eraseFromParent(); + VMap.erase(it); + } + } + // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index e1af02829c1d..dd7150bc63ec 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder( auto UnrollResult = LoopUnrollResult::Unmodified; if (remainderLoop && UnrollRemainder) { LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); - UnrollResult = - UnrollLoop(remainderLoop, - {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false, - /*AllowExpensiveTripCount*/ false, - /*UnrollRemainder*/ false, ForgetAllSCEV}, - LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); + UnrollLoopOptions ULO; + ULO.Count = Count - 1; + ULO.Force = false; + ULO.Runtime = false; + ULO.AllowExpensiveTripCount = false; + ULO.UnrollRemainder = false; + ULO.ForgetAllSCEV = ForgetAllSCEV; + assert(!getLoopConvergenceHeart(L) && + "A loop with a convergence heart does not allow runtime unrolling."); + UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI, + /*ORE*/ nullptr, PreserveLCSSA); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 058746880743..d6b4acb2bdba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -274,6 +274,13 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline 
AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul, + /* Commutative =*/true> +m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { + return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1); +} + /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ab3b5cf2b9da..8ec67eb2f54b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1037,8 +1037,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)), - m_Mul(m_SpecificInt(1), m_VPValue(A))))) + if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return R.getVPSingleValue()->replaceAllUsesWith(A); } |
