100 files changed, 4032 insertions, 743 deletions
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 2637e2f97dbb..ea67b526423b 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/InstructionCost.h"
 
@@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues(
   completeEphemeralValues(Visited, Worklist, EphValues);
 }
 
+static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) {
+  if (!L)
+    return false;
+  if (!isa<ConvergenceControlInst>(I))
+    return false;
+  for (const auto *U : I.users()) {
+    if (!L->contains(cast<Instruction>(U)))
+      return true;
+  }
+  return false;
+}
+
 /// Fill in the current structure with information gleaned from the specified
 /// block.
 void CodeMetrics::analyzeBasicBlock(
     const BasicBlock *BB, const TargetTransformInfo &TTI,
-    const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) {
+    const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO,
+    const Loop *L) {
   ++NumBlocks;
   InstructionCost NumInstsBeforeThisBB = NumInsts;
   for (const Instruction &I : *BB) {
@@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock(
     if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
       ++NumVectorInsts;
 
-    if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+    if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) &&
+        I.isUsedOutsideOfBlock(BB)) {
+      LLVM_DEBUG(dbgs() << I
+                        << "\n  Cannot duplicate a token value used outside "
+                           "the current block (except convergence control).\n");
       notDuplicatable = true;
-
-    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->cannotDuplicate())
-        notDuplicatable = true;
-      if (CI->isConvergent())
-        convergent = true;
     }
 
-    if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
-      if (InvI->cannotDuplicate())
+    if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+      if (CB->cannotDuplicate())
         notDuplicatable = true;
+      // Compute a meet over the visited blocks for the following partial order:
+      //
+      // None -> { Controlled, ExtendedLoop, Uncontrolled}
+      // Controlled -> ExtendedLoop
+      if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) {
+        if (isa<ConvergenceControlInst>(CB) ||
+            CB->getConvergenceControlToken()) {
+          assert(Convergence != ConvergenceKind::Uncontrolled);
+          LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n");
+          if (extendsConvergenceOutsideLoop(I, L))
+            Convergence = ConvergenceKind::ExtendedLoop;
+          else {
+            assert(Convergence != ConvergenceKind::ExtendedLoop);
+            Convergence = ConvergenceKind::Controlled;
+          }
+        } else {
+          assert(Convergence == ConvergenceKind::None);
+          Convergence = ConvergenceKind::Uncontrolled;
+        }
+      }
+    }
 
     NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
   }
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 369ab087ffc0..c34c4974382e 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
   return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default);
 }
 
+CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) {
+  BasicBlock *H = TheLoop->getHeader();
+  for (Instruction &II : *H) {
+    if (auto *CB = dyn_cast<CallBase>(&II)) {
+      if (!CB->isConvergent())
+        continue;
+      // This is the heart if it uses a token defined outside the loop. The
+      // verifier has already checked that only the loop intrinsic can use such
+      // a token.
+      if (auto *Token = CB->getConvergenceControlToken()) {
+        auto *TokenDef = cast<Instruction>(Token);
+        if (!TheLoop->contains(TokenDef->getParent()))
+          return CB;
+      }
+      return nullptr;
+    }
+  }
+  return nullptr;
+}
+
 bool llvm::isFinite(const Loop *L) {
   return L->getHeader()->getParent()->willReturn();
 }
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 08138a5e2f2d..782c28c94483 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7296,10 +7296,13 @@ static bool isGuaranteedNotToBeUndefOrPoison(
         isa<ConstantPointerNull>(C) || isa<Function>(C))
       return true;
 
-    if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C))
-      return (!includesUndef(Kind) ? !C->containsPoisonElement()
-                                   : !C->containsUndefOrPoisonElement()) &&
-             !C->containsConstantExpression();
+    if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) {
+      if (includesUndef(Kind) && C->containsUndefElement())
+        return false;
+      if (includesPoison(Kind) && C->containsPoisonElement())
+        return false;
+      return !C->containsConstantExpression();
+    }
   }
 
   // Strip cast operations from a pointer value.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05..30728ed58750 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -68,6 +68,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::sqrt: // Begin floating-point.
   case Intrinsic::sin:
   case Intrinsic::cos:
+  case Intrinsic::tan:
   case Intrinsic::exp:
   case Intrinsic::exp2:
   case Intrinsic::log:
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index d3ab306904da..7d7fe19568e8 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -604,6 +604,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(aarch64_vector_pcs);
   KEYWORD(aarch64_sve_vector_pcs);
   KEYWORD(aarch64_sme_preservemost_from_x0);
+  KEYWORD(aarch64_sme_preservemost_from_x1);
   KEYWORD(aarch64_sme_preservemost_from_x2);
   KEYWORD(msp430_intrcc);
   KEYWORD(avr_intrcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 07c8aa23fc5e..f0fde9ae4df5 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2153,6 +2153,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'aarch64_vector_pcs'
 ///   ::= 'aarch64_sve_vector_pcs'
 ///   ::= 'aarch64_sme_preservemost_from_x0'
+///   ::= 'aarch64_sme_preservemost_from_x1'
 ///   ::= 'aarch64_sme_preservemost_from_x2'
 ///   ::= 'msp430_intrcc'
 ///   ::= 'avr_intrcc'
@@ -2212,6 +2213,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_aarch64_sme_preservemost_from_x0:
     CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0;
     break;
+  case lltok::kw_aarch64_sme_preservemost_from_x1:
+    CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1;
+    break;
   case lltok::kw_aarch64_sme_preservemost_from_x2:
     CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2;
     break;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index b4765fb280f9..66b1c5f8ca82 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT.
+// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT,
+// G_INSERT_VECTOR_ELT, and G_VSCALE
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -400,3 +401,86 @@ bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
 
   return false;
 }
+
+bool CombinerHelper::matchAddOfVScale(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GAdd *Add = cast<GAdd>(MRI.getVRegDef(MO.getReg()));
+  GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getLHSReg()));
+  GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getRHSReg()));
+
+  Register Dst = Add->getReg(0);
+
+  if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) ||
+      !MRI.hasOneNonDBGUse(RHSVScale->getReg(0)))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildVScale(Dst, LHSVScale->getSrc() + RHSVScale->getSrc());
+  };
+
+  return true;
+}
+
+bool CombinerHelper::matchMulOfVScale(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GMul *Mul = cast<GMul>(MRI.getVRegDef(MO.getReg()));
+  GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Mul->getLHSReg()));
+
+  std::optional<APInt> MaybeRHS = getIConstantVRegVal(Mul->getRHSReg(), MRI);
+  if (!MaybeRHS)
+    return false;
+
+  Register Dst = MO.getReg();
+
+  if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildVScale(Dst, LHSVScale->getSrc() * *MaybeRHS);
+  };
+
+  return true;
+}
+
+bool CombinerHelper::matchSubOfVScale(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GSub *Sub = cast<GSub>(MRI.getVRegDef(MO.getReg()));
+  GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Sub->getRHSReg()));
+
+  Register Dst = MO.getReg();
+  LLT DstTy = MRI.getType(Dst);
+
+  if (!MRI.hasOneNonDBGUse(RHSVScale->getReg(0)) ||
+      !isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, DstTy}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto VScale = B.buildVScale(DstTy, -RHSVScale->getSrc());
+    B.buildAdd(Dst, Sub->getLHSReg(), VScale, Sub->getFlags());
+  };
+
+  return true;
+}
+
+bool CombinerHelper::matchShlOfVScale(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GShl *Shl = cast<GShl>(MRI.getVRegDef(MO.getReg()));
+  GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Shl->getSrcReg()));
+
+  std::optional<APInt> MaybeRHS = getIConstantVRegVal(Shl->getShiftReg(), MRI);
+  if (!MaybeRHS)
+    return false;
+
+  Register Dst = MO.getReg();
+  LLT DstTy = MRI.getType(Dst);
+
+  if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) ||
+      !isLegalOrBeforeLegalizer({TargetOpcode::G_VSCALE, DstTy}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildVScale(Dst, LHSVScale->getSrc().shl(*MaybeRHS));
+  };
+
+  return true;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6f0cae2edab1..9830b521797c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -449,6 +449,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
     RTLIBCASE(SIN_F);
   case TargetOpcode::G_FCOS:
     RTLIBCASE(COS_F);
+  case TargetOpcode::G_FTAN:
+    RTLIBCASE(TAN_F);
   case TargetOpcode::G_FLOG10:
     RTLIBCASE(LOG10_F);
   case TargetOpcode::G_FLOG:
@@ -1037,6 +1039,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
   case TargetOpcode::G_FREM:
   case TargetOpcode::G_FCOS:
   case TargetOpcode::G_FSIN:
+  case TargetOpcode::G_FTAN:
   case TargetOpcode::G_FLOG10:
   case TargetOpcode::G_FLOG:
   case TargetOpcode::G_FLOG2:
@@ -2893,6 +2896,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   case TargetOpcode::G_FFLOOR:
   case TargetOpcode::G_FCOS:
   case TargetOpcode::G_FSIN:
+  case TargetOpcode::G_FTAN:
   case TargetOpcode::G_FLOG10:
   case TargetOpcode::G_FLOG:
   case TargetOpcode::G_FLOG2:
@@ -4659,6 +4663,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_INTRINSIC_TRUNC:
   case G_FCOS:
   case G_FSIN:
+  case G_FTAN:
   case G_FSQRT:
   case G_BSWAP:
   case G_BITREVERSE:
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index e8438be94b3c..129e6963aef3 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -833,6 +833,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
   case TargetOpcode::G_FREM:
   case TargetOpcode::G_FSIN:
   case TargetOpcode::G_FCOS:
+  case TargetOpcode::G_FTAN:
   case TargetOpcode::G_FMA:
   case TargetOpcode::G_FMAD:
     if (SNaN)
@@ -1713,6 +1714,7 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) {
   case TargetOpcode::G_FREM:
   case TargetOpcode::G_FRINT:
   case TargetOpcode::G_FSIN:
+  case TargetOpcode::G_FTAN:
   case TargetOpcode::G_FSQRT:
   case TargetOpcode::G_FSUB:
   case TargetOpcode::G_INTRINSIC_ROUND:
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 3397bd0a6060..a808a541103f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1339,14 +1339,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (SrcIdx && DstIdx)
     return false;
 
-  [[maybe_unused]] const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
+  const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
   const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
   if (!DefMI->isImplicitDef()) {
     if (DstReg.isPhysical()) {
       Register NewDstReg = DstReg;
 
-      unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
-                                              DefMI->getOperand(0).getSubReg());
+      unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx);
       if (NewDstIdx)
         NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9a5359015439..02cd125eeff0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4041,17 +4041,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
   }
 
-  // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
-  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
-    if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
-      SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
-      SDValue S0 = N1.getOperand(0);
-      if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
-        if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
-          if (C->getAPIntValue() == (BitWidth - 1))
-            return DAG.getNode(ISD::ABS, DL, VT, S0);
-    }
-  }
+  // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A)
+  if (hasOperation(ISD::ABS, VT) &&
+      sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) &&
+      sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1))))
+    return DAG.getNode(ISD::ABS, DL, VT, A);
 
   // If the relocation model supports it, consider symbol offsets.
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8cd2bb60d81f..27c45cab2e0d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4514,6 +4514,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::COS_F80, RTLIB::COS_F128,
                     RTLIB::COS_PPCF128, Results);
     break;
+  case ISD::FTAN:
+  case ISD::STRICT_FTAN:
+    ExpandFPLibCall(Node, RTLIB::TAN_F32, RTLIB::TAN_F64, RTLIB::TAN_F80,
+                    RTLIB::TAN_F128, RTLIB::TAN_PPCF128, Results);
+    break;
   case ISD::FSINCOS:
     // Expand into sincos libcall.
     ExpandSinCosLibCall(Node, Results);
@@ -5468,6 +5473,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::FSQRT:
   case ISD::FSIN:
   case ISD::FCOS:
+  case ISD::FTAN:
   case ISD::FLOG:
   case ISD::FLOG2:
   case ISD::FLOG10:
@@ -5492,6 +5498,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::STRICT_FSQRT:
   case ISD::STRICT_FSIN:
   case ISD::STRICT_FCOS:
+  case ISD::STRICT_FTAN:
   case ISD::STRICT_FLOG:
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FLOG10:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index fb1424f75e09..aa116c9de5d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -131,6 +131,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
     case ISD::STRICT_FSUB:
     case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
+    case ISD::STRICT_FTAN:
+    case ISD::FTAN:        R = SoftenFloatRes_FTAN(N); break;
     case ISD::STRICT_FTRUNC:
     case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
     case ISD::LOAD:        R = SoftenFloatRes_LOAD(N); break;
@@ -774,6 +776,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
                                                RTLIB::SUB_PPCF128));
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTAN(SDNode *N) {
+  return SoftenFloatRes_Unary(
+      N, GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, RTLIB::TAN_F64,
+                      RTLIB::TAN_F80, RTLIB::TAN_F128, RTLIB::TAN_PPCF128));
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
   return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
                                               RTLIB::TRUNC_F32,
@@ -1330,7 +1338,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
 #endif
     report_fatal_error("Do not know how to expand the result of this "
                        "operator!");
-
+    // clang-format off
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
@@ -1399,6 +1407,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::FSQRT:      ExpandFloatRes_FSQRT(N, Lo, Hi); break;
   case ISD::STRICT_FSUB:
   case ISD::FSUB:       ExpandFloatRes_FSUB(N, Lo, Hi); break;
+  case ISD::STRICT_FTAN:
+  case ISD::FTAN:       ExpandFloatRes_FTAN(N, Lo, Hi); break;
   case ISD::STRICT_FTRUNC:
   case ISD::FTRUNC:     ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
   case ISD::LOAD:       ExpandFloatRes_LOAD(N, Lo, Hi); break;
@@ -1408,6 +1418,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
   case ISD::STRICT_FREM:
   case ISD::FREM:       ExpandFloatRes_FREM(N, Lo, Hi); break;
+    // clang-format on
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1768,6 +1779,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
                                         RTLIB::SUB_PPCF128), Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_FTAN(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  ExpandFloatRes_Unary(N,
+                       GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32,
+                                    RTLIB::TAN_F64, RTLIB::TAN_F80,
+                                    RTLIB::TAN_F128, RTLIB::TAN_PPCF128),
+                       Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
                                              SDValue &Lo, SDValue &Hi) {
   ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
@@ -2479,6 +2499,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FSIN:
     case ISD::FSQRT:
     case ISD::FTRUNC:
+    case ISD::FTAN:
     case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break;
 
     // Binary FP Operations
@@ -2914,6 +2935,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
+  case ISD::FTAN:
   case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
 
   // Binary FP Operations
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index bec9cb49b586..2350b562a034 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -586,6 +586,7 @@ private:
   SDValue SoftenFloatRes_FSIN(SDNode *N);
   SDValue SoftenFloatRes_FSQRT(SDNode *N);
   SDValue SoftenFloatRes_FSUB(SDNode *N);
+  SDValue SoftenFloatRes_FTAN(SDNode *N);
   SDValue SoftenFloatRes_FTRUNC(SDNode *N);
   SDValue SoftenFloatRes_LOAD(SDNode *N);
   SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N);
@@ -635,6 +636,7 @@ private:
                             SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC,
                              SDValue &Lo, SDValue &Hi);
+  // clang-format off
   void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FMINNUM   (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FMAXNUM   (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -667,9 +669,11 @@ private:
   void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FTAN      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FTRUNC    (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_LOAD      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+  // clang-format on
 
   // Float Operand Expansion.
   bool ExpandFloatOperand(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6acbc044d673..8cdb4ba0ade6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -397,6 +397,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FSQRT:
   case ISD::FSIN:
   case ISD::FCOS:
+  case ISD::FTAN:
   case ISD::FLDEXP:
   case ISD::FPOWI:
   case ISD::FPOW:
@@ -506,7 +507,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         break;                                                                 \
     }                                                                          \
     /* Defer non-vector results to LegalizeDAG. */                             \
-    if (!Node->getValueType(0).isVector()) {                                   \
+    if (!Node->getValueType(0).isVector() &&                                   \
+        Node->getValueType(0) != MVT::Other) {                                 \
       Action = TargetLowering::Legal;                                          \
       break;                                                                   \
     }                                                                          \
@@ -990,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
     break;
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
-    if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
-      Results.push_back(Expanded);
-      return;
-    }
-    break;
+    Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
+    return;
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 361416edb554..92ce3b17ed6c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -108,6 +108,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
+  case ISD::FTAN:
   case ISD::FTRUNC:
   case ISD::SIGN_EXTEND:
   case ISD::SINT_TO_FP:
@@ -1140,6 +1141,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT: case ISD::VP_SQRT:
+  case ISD::FTAN:
   case ISD::FTRUNC:
   case ISD::VP_FROUNDTOZERO:
   case ISD::SINT_TO_FP:
@@ -4400,6 +4402,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
+  case ISD::FTAN:
   case ISD::FTRUNC:
     if (unrollExpandedOp())
       break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4a6a431696b5..e176cf2cc2a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5375,6 +5375,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
   case ISD::FREM:
   case ISD::FSIN:
   case ISD::FCOS:
+  case ISD::FTAN:
   case ISD::FMA:
   case ISD::FMAD: {
     if (SNaN)
@@ -6332,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
 }
 
 SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
-                                             EVT VT, ArrayRef<SDValue> Ops) {
+                                             EVT VT, ArrayRef<SDValue> Ops,
+                                             SDNodeFlags Flags) {
   // If the opcode is a target-specific ISD node, there's nothing we can
   // do here and the operand rules may not line up with the below, so
   // bail early.
@@ -6689,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
     }
 
     // Constant fold the scalar operands.
-    SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
+    SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
 
     // Legalize the (integer) scalar constant if necessary.
     if (LegalSVT != SVT)
@@ -7260,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   }
 
   // Perform trivial constant folding.
-  if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
+  if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
     return SV;
 
   // Canonicalize an UNDEF to the RHS, even over a constant.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba76456b5836..2f3626f1c820 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1684,7 +1684,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
           if (!FragmentExpr)
             continue;
           SDDbgValue *SDV = DAG.getVRegDbgValue(
-              Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, SDNodeOrder);
+              Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, Order);
           DAG.AddDbgValue(SDV, false);
           Offset += RegisterSize;
         }
@@ -1699,11 +1699,10 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
   }
 
   // We have created a SDDbgOperand for each Value in Values.
-  // Should use Order instead of SDNodeOrder?
   assert(!LocationOps.empty());
-  SDDbgValue *SDV = DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies,
-                                        /*IsIndirect=*/false, DbgLoc,
-                                        SDNodeOrder, IsVariadic);
+  SDDbgValue *SDV =
+      DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies,
+                          /*IsIndirect=*/false, DbgLoc, Order, IsVariadic);
   DAG.AddDbgValue(SDV, /*isParameter=*/false);
   return true;
 }
@@ -6742,6 +6741,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::fabs:
   case Intrinsic::sin:
   case Intrinsic::cos:
+  case Intrinsic::tan:
   case Intrinsic::exp10:
   case Intrinsic::floor:
   case Intrinsic::ceil:
@@ -6759,6 +6759,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     case Intrinsic::fabs:         Opcode = ISD::FABS;          break;
     case Intrinsic::sin:          Opcode = ISD::FSIN;          break;
     case Intrinsic::cos:          Opcode = ISD::FCOS;          break;
+    case Intrinsic::tan:          Opcode = ISD::FTAN;          break;
     case Intrinsic::exp10:        Opcode = ISD::FEXP10;        break;
     case Intrinsic::floor:        Opcode = ISD::FFLOOR;        break;
     case Intrinsic::ceil:         Opcode = ISD::FCEIL;         break;
@@ -9160,6 +9161,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
         if (visitUnaryFloatCall(I, ISD::FCOS))
           return;
         break;
+      case LibFunc_tan:
+      case LibFunc_tanf:
+      case LibFunc_tanl:
+        if (visitUnaryFloatCall(I, ISD::FTAN))
+          return;
+        break;
       case LibFunc_sqrt:
       case LibFunc_sqrtf:
       case LibFunc_sqrtl:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 2198c2354483..52da24b59451 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -210,6 +210,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FCOS:                       return "fcos";
   case ISD::STRICT_FCOS:                return "strict_fcos";
   case ISD::FSINCOS:                    return "fsincos";
+  case ISD::FTAN:                       return "ftan";
+  case ISD::STRICT_FTAN:                return "strict_ftan";
   case ISD::FTRUNC:                     return "ftrunc";
   case ISD::STRICT_FTRUNC:              return "strict_ftrunc";
   case ISD::FFLOOR:                     return "ffloor";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f856c8a51984..e1c1a6b09b11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   bool IsMax = Opc == ISD::FMAXIMUM;
   SDNodeFlags Flags = N->getFlags();
 
-  if (VT.isVector() &&
-      isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
-    return SDValue();
-
   // First, implement comparison not propagating NaN. If no native fmin or fmax
   // available, use plain select with setcc instead.
   SDValue MinMax;
@@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   } else if (isOperationLegalOrCustom(CompOpc, VT)) {
     MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
   } else {
+    if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+      return DAG.UnrollVectorOp(N);
+
     // NaN (if exists) will be propagated later, so orderness doesn't matter.
     SDValue Compare =
         DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
@@ -9159,6 +9158,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
   if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
       isOperationLegal(ISD::SMAX, VT)) {
     SDValue Zero = DAG.getConstant(0, dl, VT);
+    Op = DAG.getFreeze(Op);
     return DAG.getNode(ISD::SMAX, dl, VT, Op,
                        DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
   }
@@ -9175,8 +9175,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
   // 0 - abs(x) -> smin(x, sub(0,x))
   if (IsNegative && isOperationLegal(ISD::SUB, VT) &&
       isOperationLegal(ISD::SMIN, VT)) {
-    Op = DAG.getFreeze(Op);
     SDValue Zero = DAG.getConstant(0, dl, VT);
+    Op = DAG.getFreeze(Op);
     return DAG.getNode(ISD::SMIN, dl, VT, Op,
                        DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
   }
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3aec7049e0cc..8240a1fd7e2f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -141,6 +141,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::EXP10_F128, "exp10f128");
     setLibcallName(RTLIB::SIN_F128, "sinf128");
     setLibcallName(RTLIB::COS_F128, "cosf128");
+    setLibcallName(RTLIB::TAN_F128, "tanf128");
     setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
     setLibcallName(RTLIB::POW_F128, "powf128");
     setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
@@ -1015,7 +1016,8 @@ void TargetLoweringBase::initActions() {
   setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP,
                       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT,
                       ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::LROUND,
-                      ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN},
+                      ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN,
+                      ISD::FTAN},
                      {MVT::f32, MVT::f64, MVT::f128}, Expand);
 
   // Default ISD::TRAP to expand (which turns it into abort).
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 004622061120..f44a6a472cb6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -1183,8 +1183,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
   StubMap::const_iterator i = Stubs.find(Value);
   if (i != Stubs.end()) {
     resolveRelocation(Section, Offset,
-                      (uint64_t)Section.getAddressWithOffset(i->second),
-                      RelType, 0);
+                      Section.getLoadAddressWithOffset(i->second), RelType, 0);
     LLVM_DEBUG(dbgs() << " Stub function found\n");
   } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
     // Create a new stub function.
@@ -1217,8 +1216,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
       addRelocationForSection(REmovk_g0, Value.SectionID);
     }
     resolveRelocation(Section, Offset,
-                      reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
-                          Section.getStubOffset())),
+                      Section.getLoadAddressWithOffset(Section.getStubOffset()),
                       RelType, 0);
     Section.advanceStubOffset(getMaxStubSize());
   }
@@ -1349,10 +1347,9 @@ RuntimeDyldELF::processRelocationRef(
       // Look for an existing stub.
       StubMap::const_iterator i = Stubs.find(Value);
       if (i != Stubs.end()) {
-        resolveRelocation(
-            Section, Offset,
-            reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)),
-            RelType, 0);
+        resolveRelocation(Section, Offset,
+                          Section.getLoadAddressWithOffset(i->second), RelType,
+                          0);
         LLVM_DEBUG(dbgs() << " Stub function found\n");
       } else {
         // Create a new stub function.
@@ -1367,10 +1364,10 @@ RuntimeDyldELF::processRelocationRef(
         else
           addRelocationForSection(RE, Value.SectionID);
 
-        resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>(
-                                               Section.getAddressWithOffset(
-                                                   Section.getStubOffset())),
-                          RelType, 0);
+        resolveRelocation(
+            Section, Offset,
+            Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType,
+            0);
         Section.advanceStubOffset(getMaxStubSize());
       }
     } else {
@@ -1609,8 +1606,7 @@ RuntimeDyldELF::processRelocationRef(
         if (i != Stubs.end()) {
           // Symbol function stub already created, just relocate to it
           resolveRelocation(Section, Offset,
-                            reinterpret_cast<uint64_t>(
-                                Section.getAddressWithOffset(i->second)),
+                            Section.getLoadAddressWithOffset(i->second),
                             RelType, 0);
           LLVM_DEBUG(dbgs() << " Stub function found\n");
         } else {
@@ -1652,10 +1648,10 @@ RuntimeDyldELF::processRelocationRef(
             addRelocationForSection(REl, Value.SectionID);
           }
 
-          resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>(
-                                                 Section.getAddressWithOffset(
-                                                     Section.getStubOffset())),
-                            RelType, 0);
+          resolveRelocation(
+              Section, Offset,
+              Section.getLoadAddressWithOffset(Section.getStubOffset()),
+              RelType, 0);
           Section.advanceStubOffset(getMaxStubSize());
         }
         if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID)) {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2c4b45255d05..92213e19c9d9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
 
   // Loop is not unrollable if the loop contains certain instructions.
-  if (!UCE.canUnroll() || UCE.Convergent) {
+  if (!UCE.canUnroll()) {
     LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
     return 1;
   }
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 7a5f18fe2cbd..0bf8be9ac55f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -326,6 +326,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
     Out << "aarch64_sme_preservemost_from_x0";
     break;
+  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
+    Out << "aarch64_sme_preservemost_from_x1";
+    break;
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
     Out << "aarch64_sme_preservemost_from_x2";
     break;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a7ed2de6e8a5..2f4b8351e747 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5368,8 +5368,8 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
     return DL.empty() ? std::string("G1") : (DL + "-G1").str();
   }
 
-  if (T.isRISCV64()) {
-    // Make i32 a native type for 64-bit RISC-V.
+  if (T.isLoongArch64() || T.isRISCV64()) {
+    // Make i32 a native type for 64-bit LoongArch and RISC-V.
     auto I = DL.find("-n64-");
     if (I != StringRef::npos)
       return (DL.take_front(I) + "-n32:64-" + DL.drop_front(I + 5)).str();
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 985f9351f4a3..788e92f94b26 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -877,7 +877,7 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
       break;
     case wasm::WASM_EXTERNAL_TABLE:
       W->OS << char(Import.Table.ElemType);
-      encodeULEB128(0, W->OS);           // flags
+      encodeULEB128(Import.Table.Limits.Flags, W->OS);
       encodeULEB128(NumElements, W->OS); // initial
       break;
     case wasm::WASM_EXTERNAL_TAG:
@@ -1022,7 +1022,8 @@ void WasmObjectWriter::writeElemSection(
     encodeULEB128(TableNumber, W->OS); // the table number
 
   // init expr for starting offset
-  W->OS << char(wasm::WASM_OPCODE_I32_CONST);
+  W->OS << char(is64Bit() ? wasm::WASM_OPCODE_I64_CONST
+                          : wasm::WASM_OPCODE_I32_CONST);
   encodeSLEB128(InitialTableOffset, W->OS);
   W->OS << char(wasm::WASM_OPCODE_END);
 
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 2b6bdbf24afa..cbc55a145e0e 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -586,6 +586,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
     return "gfx1150";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151:
     return "gfx1151";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152:
+    return "gfx1152";
 
   // AMDGCN GFX12.
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200:
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 8e2a9481c922..0fee299994bc 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -611,6 +611,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1103, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1150, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1151, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1152, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 316d05bf1dc3..8dd060d0151a 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -139,6 +139,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionImport.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 50682ca4970f..dad97146a9f6 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("globalopt", GlobalOptPass())
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index fc3be716087e..693897f874a2 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -690,7 +690,7 @@ Error RawMemProfReader::readNextRecord(
       return F;
     auto Iter = this->GuidToSymbolName.find(F.Function);
     assert(Iter != this->GuidToSymbolName.end());
-    F.SymbolName = Iter->getSecond();
+    F.SymbolName = std::make_unique<std::string>(Iter->getSecond());
     return F;
   };
   return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback);
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index fcefdef992be..7360901f2962 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -867,21 +867,16 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
   // Any intermediate directories we create should be accessible by
   // the owner, even if Perms says otherwise for the final path.
   const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all;
+
+  StringRef Name = *I;
   while (true) {
-    StringRef Name = *I;
-    detail::InMemoryNode *Node = Dir->getChild(Name);
+    Name = *I;
     ++I;
+    if (I == E)
+      break;
+    detail::InMemoryNode *Node = Dir->getChild(Name);
     if (!Node) {
-      if (I == E) {
-        // End of the path.
-        Dir->addChild(
-            Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime,
-                            std::move(Buffer), ResolvedUser, ResolvedGroup,
-                            ResolvedType, ResolvedPerms}));
-        return true;
-      }
-
-      // Create a new directory. Use the path up to here.
+      // This isn't the last element, so we create a new directory.
       Status Stat(
           StringRef(Path.str().begin(), Name.end() - Path.str().begin()),
           getDirectoryID(Dir->getUniqueID(), Name),
@@ -891,27 +886,33 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
           Name, std::make_unique<detail::InMemoryDirectory>(std::move(Stat))));
       continue;
     }
+    // Creating file under another file.
+    if (!isa<detail::InMemoryDirectory>(Node))
+      return false;
+    Dir = cast<detail::InMemoryDirectory>(Node);
+  }
+  detail::InMemoryNode *Node = Dir->getChild(Name);
+  if (!Node) {
+    Dir->addChild(Name,
+                  MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime,
+                            std::move(Buffer), ResolvedUser, ResolvedGroup,
+                            ResolvedType, ResolvedPerms}));
+    return true;
+  }
+  if (isa<detail::InMemoryDirectory>(Node))
+    return ResolvedType == sys::fs::file_type::directory_file;
 
-    if (auto *NewDir = dyn_cast<detail::InMemoryDirectory>(Node)) {
-      Dir = NewDir;
-    } else {
-      assert((isa<detail::InMemoryFile>(Node) ||
-              isa<detail::InMemoryHardLink>(Node)) &&
-             "Must be either file, hardlink or directory!");
-
-      // Trying to insert a directory in place of a file.
-      if (I != E)
-        return false;
+  assert((isa<detail::InMemoryFile>(Node) ||
+          isa<detail::InMemoryHardLink>(Node)) &&
+         "Must be either file, hardlink or directory!");
 
-      // Return false only if the new file is different from the existing one.
-      if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node)) {
-        return Link->getResolvedFile().getBuffer()->getBuffer() ==
-               Buffer->getBuffer();
-      }
-      return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() ==
-             Buffer->getBuffer();
-    }
+  // Return false only if the new file is different from the existing one.
+  if (auto *Link = dyn_cast<detail::InMemoryHardLink>(Node)) {
+    return Link->getResolvedFile().getBuffer()->getBuffer() ==
+           Buffer->getBuffer();
   }
+  return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() ==
+         Buffer->getBuffer();
 }
 
 bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 4b2ce0d73949..5708b6173750 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -85,6 +85,10 @@ def SMEUnsupported : AArch64Unsupported {
                       SME2Unsupported.F);
 }
 
+def MTEUnsupported : AArch64Unsupported {
+  let F = [HasMTE];
+}
+
 let F = [HasPAuth, HasPAuthLR] in
 def PAUnsupported : AArch64Unsupported;
 
@@ -109,6 +113,7 @@ include "AArch64SchedNeoverseN1.td"
 include "AArch64SchedNeoverseN2.td"
 include "AArch64SchedNeoverseV1.td"
 include "AArch64SchedNeoverseV2.td"
+include "AArch64SchedOryon.td"
 
 include "AArch64Processors.td"
 
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 32646c6ee689..941990c53c4a 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -589,6 +589,14 @@ def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
                                                  (sequence "X%u",19, 28),
                                                  LR, FP)>;
 
+// SME ABI support routines such as __arm_get_current_vg preserve most registers.
+def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
+                          : CalleeSavedRegs<(add (sequence "Z%u", 0, 31),
+                                                 (sequence "P%u", 0, 15),
+                                                 (sequence "X%u", 1, 15),
+                                                 (sequence "X%u",19, 28),
+                                                 LR, FP)>;
+
 // SME ABI support routines __arm_sme_state preserves most registers.
 def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
                           : CalleeSavedRegs<(add (sequence "Z%u", 0, 31),
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 8d16709114df..a759efcd9441 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -617,6 +617,27 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
                                     FeatureLdpAlignedOnly,
                                     FeatureStpAlignedOnly]>;
 
+def TuneOryon  : SubtargetFeature<"oryon-1", "ARMProcFamily",
+                                    "Oryon",
+                                    "Nuvia Inc Oryon processors", [
+                                    FeatureCrypto,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeatureFuseAES,
+                                    FeatureFuseAdrpAdd,
+                                    FeatureEnableSelectOptimize,
+                                    FeatureFuseCryptoEOR,
+                                    FeatureFuseAddress,
+                                    FeatureSM4,
+                                    FeatureSHA2,
+                                    FeatureSHA3,
+                                    FeatureAES,
+                                    FeatureFullFP16,
+                                    FeatureFP16FML,
+                                    FeaturePerfMon,
+                                    FeatureSPE,
+                                    FeaturePostRAScheduler,
+                                    HasV8_6aOps]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -806,6 +827,11 @@ def ProcessorFeatures {
                                      FeatureSHA3, FeatureAES, FeatureCSSC,
                                      FeatureWFxT, FeatureFullFP16];
 
+  list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
+                                     FeatureCrypto, FeatureRandGen,
+                                     FeaturePAuth, FeatureSM4, FeatureSHA2,
+                                     FeatureSHA3, FeatureAES];
+
   // ETE and TRBE are future architecture extensions. We temporarily enable them
   // by default for users targeting generic AArch64. The extensions do not
   // affect code generated by the compiler and can be used only by explicitly
@@ -988,3 +1014,7 @@ def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A,
 
 def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B,
                      [TuneAmpere1B]>;
+
+// Qualcomm Oryon
+def : ProcessorModel<"oryon-1", OryonModel, ProcessorFeatures.Oryon,
+                       [TuneOryon]>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index e97d7e3b6ed8..cc50b59dd8d7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -107,13 +107,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
     report_fatal_error(
-        "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
-        "only supported to improve calls to SME ACLE save/restore/disable-za "
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is only "
+        "supported to improve calls to SME ACLE save/restore/disable-za "
         "functions, and is not intended to be used beyond that scope.");
   if (MF->getFunction().getCallingConv() ==
+      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+    report_fatal_error(
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is "
+        "only supported to improve calls to SME ACLE __arm_get_current_vg "
+        "function, and is not intended to be used beyond that scope.");
+  if (MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
     report_fatal_error(
-        "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
         "only supported to improve calls to SME ACLE __arm_sme_state "
         "and is not intended to be used beyond that scope.");
   if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
@@ -153,13 +162,22 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
   if (MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
     report_fatal_error(
-        "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is "
         "only supported to improve calls to SME ACLE save/restore/disable-za "
         "functions, and is not intended to be used beyond that scope.");
   if (MF->getFunction().getCallingConv() ==
+      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+    report_fatal_error(
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is "
+        "only supported to improve calls to SME ACLE __arm_get_current_vg "
+        "function, and is not intended to be used beyond that scope.");
+  if (MF->getFunction().getCallingConv() ==
           CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
     report_fatal_error(
-        "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
+        "Calling convention "
+        "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is "
         "only supported to improve calls to SME ACLE __arm_sme_state "
         "and is not intended to be used beyond that scope.");
   if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
@@ -236,6 +254,8 @@ AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
         "Calling convention SVE_VectorCall is unsupported on Darwin.");
   if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
     return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask;
+  if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+    return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask;
   if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
     return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask;
   if (CC == CallingConv::CFGuard_Check)
@@ -282,6 +302,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                : CSR_AArch64_SVE_AAPCS_RegMask;
   if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0)
     return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask;
+  if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)
+    return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask;
   if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)
     return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask;
   if (CC == CallingConv::CFGuard_Check)
@@ -643,6 +665,7 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
   case CallingConv::AArch64_VectorCall:
   case CallingConv::AArch64_SVE_VectorCall:
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
+  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
     if (STI.isTargetWindows())
       return HasReg(CC_AArch64_Win64PCS_ArgRegs, Reg);
diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
new file mode 100644
index 000000000000..09d1af248f0e
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
@@ -0,0 +1,1659 @@
+//=- AArch64SchedOryon.td - Qualcomm Oryon CPU 001 ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Qualcomm Oryon
+// family of processors.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pipeline Description.
+
+def OryonModel : SchedMachineModel {
+  let IssueWidth            =  14;
+  let MicroOpBufferSize     = 376;
+  let LoadLatency           =   4;
+  let MispredictPenalty     =  13; // 13 cycles for mispredicted branch.
+  let LoopMicroOpBufferSize =   0; // Do not have a LoopMicroOpBuffer
+  let PostRAScheduler       =   1; // Using PostRA sched.
+  let CompleteModel         =   1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    SMEUnsupported.F,
+                                                    MTEUnsupported.F,
+                                                    PAUnsupported.F,
+                                                    [HasPAuth, HasCSSC]);
+}
+
+let SchedModel = OryonModel in {
+
+// Issue ports.
+// IXU has 6 ports p0 ~ p5
+// LSU has 4 ports p6 ~ p9(ls0 ~ ls3), p10/p11(std0, std1) has to work with ls0~ls3
+// VXU has 4 ports p12 ~ p15
+
+// cross IXU/LSU/VXU resource group for FMOV P41 of VXU
+// I2V
+def ORYONI4FP0 : ProcResource<1>;
+def ORYONI5FP1 : ProcResource<1>;
+// V2I
+def ORYONFP0I4 : ProcResource<1>;
+def ORYONFP1I5 : ProcResource<1>;
+
+// store 1 for normal store instructions
+def ORYONST0 : ProcResource<1>;
+// store 2 for normal store instructions
+def ORYONST1 : ProcResource<1>;
+
+// Port 0: ALU/Indirect/Direct Branch.
+def ORYONP0 : ProcResource<1>;
+
+// Port 1: ALU/Direct Branch.
+def ORYONP1 : ProcResource<1>;
+
+// Port 2: ALU.
+def ORYONP2 : ProcResource<1>;
+
+// Port 3: ALU.
+def ORYONP3 : ProcResource<1>;
+
+// Port 4: ALU.
+def ORYONP4 : ProcResource<1> {
+    let Super = ORYONI4FP0;
+    let Super = ORYONFP0I4; }
+
+// Port 5: ALU.
+def ORYONP5 : ProcResource<1> {
+    let Super = ORYONI5FP1;
+    let Super = ORYONFP1I5; }
+
+// Port 6: Load/Store. LS0
+def ORYONP6 : ProcResource<1> {
+    let Super = ORYONST0; }
+
+// Port 7: Load/store. LS1
+def ORYONP7 : ProcResource<1> {
+    let Super = ORYONST0; }
+
+// Port 8: Load/Store. LS2
+def ORYONP8 : ProcResource<1> {
+    let Super = ORYONST1; }
+
+// Port 9: Load/store. LS3
+def ORYONP9 : ProcResource<1> {
+    let Super = ORYONST1; }
+
+// Port 10: Load/Store. STD0
+def ORYONP10SD0 : ProcResource<1> {
+    let Super = ORYONST0; }
+
+// Port 11: Load/store. STD1
+def ORYONP11SD1 : ProcResource<1> {
+    let Super = ORYONST1; }
+
+// Port 12: FP/Neon/SIMD/Crypto.
+def ORYONP12FP0 : ProcResource<1> {
+    let Super = ORYONI4FP0;
+    let Super = ORYONFP0I4; }
+
+// Port 13: FP/Neon/SIMD/Crypto.
+def ORYONP13FP1 : ProcResource<1> {
+    let Super = ORYONI5FP1;
+    let Super = ORYONFP1I5; }
+
+// Port 14: FP/Neon/SIMD/Crypto.
+def ORYONP14FP2 : ProcResource<1>;
+
+// Port 15: FP/Neon/SIMD/Crypto.
+def ORYONP15FP3 : ProcResource<1>;
+
+// Define groups for the functional units on each issue port.  Each group
+// created will be used by a WriteRes.
+
+// Integer add/shift/logical/misc. instructions on port I0/I1/I2/I3/I4/I5.
+def ORYONI012345 : ProcResGroup<[ORYONP0, ORYONP1, ORYONP2,
+                                  ORYONP3, ORYONP4, ORYONP5]> {
+  let BufferSize = 120;
+}
+
+// Direct Conditional Branch instructions on ports I0/I1.
+def ORYONI01 : ProcResGroup<[ORYONP0, ORYONP1]> {
+  let BufferSize = 40;
+}
+
+// Indirect/crypto Conditional Branch instructions on ports I0.
+def ORYONI0 : ProcResGroup<[ORYONP0]> {
+  let BufferSize = 20;
+}
+
+// Crypto/CRC/PAU instructions on ports I2.
+def ORYONI2 : ProcResGroup<[ORYONP2]> {
+  let BufferSize = 20;
+}
+
+// Multiply/Multiply-ADD instructions on ports I4/I5.
+def ORYONI45 : ProcResGroup<[ORYONP4, ORYONP5]> {
+  let BufferSize = 40;
+}
+
+// Divide instructions on ports I5.
+def ORYONI5 : ProcResGroup<[ORYONP5]> {
+  let BufferSize = 20;
+}
+
+// Comparison instructions on ports I0/I1/I2/I3.
+def ORYONI0123 : ProcResGroup<[ORYONP0, ORYONP1,
+                                ORYONP2, ORYONP3]> {
+  let BufferSize = 80;
+}
+
+// Load instructions on ports P6/P7/P8/P9.
+def ORYONLD : ProcResGroup<[ORYONP6, ORYONP7, ORYONP8, ORYONP9]> {
+  let BufferSize = 64;
+}
+
+// Store instructions on combo of STA/STD pipes
+def ORYONST : ProcResGroup<[ORYONST0, ORYONST1]> {
+    let BufferSize = 64;
+}
+
+// Arithmetic and CRYP-AED ASIMD/FP instructions on ports FP0/FP1/FP2/FP3.
+def ORYONFP0123 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1,
+                                   ORYONP14FP2, ORYONP15FP3]> {
+  let BufferSize = 192;
+}
+
+// FP Comparison and F/I move instructions on ports FP0/FP1.
+def ORYONFP01 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1]> {
+  let BufferSize = 96;
+}
+
+// FDIV instructions on ports FP3.
+def ORYONFP3 : ProcResGroup<[ORYONP15FP3]> {
+  let BufferSize = 48;
+}
+
+// CRYP-SHA instructions on ports FP1.
+def ORYONFP1 : ProcResGroup<[ORYONP14FP2]> {
+  let BufferSize = 48;
+}
+
+def ORYONFP2 : ProcResGroup<[ORYONP14FP2]> {
+  let BufferSize = 48;
+}
+
+// Reciprocal, Squre root on FP0.
+def ORYONFP0 : ProcResGroup<[ORYONP12FP0]> {
+  let BufferSize = 48;
+}
+
+// cross IXU/LSU/VXU resource group for FMOV P41 of VXU
+// I2V
+def ORYONI2V : ProcResGroup<[ORYONI4FP0, ORYONI5FP1]> {
+    let BufferSize = 40;
+}
+
+// V2I
+def ORYONV2I : ProcResGroup<[ORYONFP0I4, ORYONFP1I5]> {
+    let BufferSize = 96;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: ORYONWrite_<NumCycles>Cyc_<Resources>.
+
+// Because of the complexity of Oryon CPU, we skip the following
+// generic definitions and define each instruction specifically
+
+// These WriteRes entries are not used in the Falkor sched model.
+def : WriteRes<WriteImm, []>     { let Unsupported = 1; }
+def : WriteRes<WriteI, []>       { let Unsupported = 1; }
+def : WriteRes<WriteISReg, []>   { let Unsupported = 1; }
+def : WriteRes<WriteIEReg, []>   { let Unsupported = 1; }
+def : WriteRes<WriteExtr, []>    { let Unsupported = 1; }
+def : WriteRes<WriteIS, []>      { let Unsupported = 1; }
+def : WriteRes<WriteID32, []>    { let Unsupported = 1; }
+def : WriteRes<WriteID64, []>    { let Unsupported = 1; }
+def : WriteRes<WriteIM32, []>    { let Unsupported = 1; }
+def : WriteRes<WriteIM64, []>    { let Unsupported = 1; }
+def : WriteRes<WriteBr, []>      { let Unsupported = 1; }
+def : WriteRes<WriteBrReg, []>   { let Unsupported = 1; }
+def : WriteRes<WriteLD, []>      { let Unsupported = 1; }
+def : WriteRes<WriteST, []>      { let Unsupported = 1; }
+def : WriteRes<WriteSTP, []>     { let Unsupported = 1; }
+def : WriteRes<WriteAdr, []>     { let Unsupported = 1; }
+def : WriteRes<WriteLDIdx, []>   { let Unsupported = 1; }
+def : WriteRes<WriteSTIdx, []>   { let Unsupported = 1; }
+def : WriteRes<WriteF, []>       { let Unsupported = 1; }
+def : WriteRes<WriteFCmp, []>    { let Unsupported = 1; }
+def : WriteRes<WriteFCvt, []>    { let Unsupported = 1; }
+def : WriteRes<WriteFCopy, []>   { let Unsupported = 1; }
+def : WriteRes<WriteFImm, []>    { let Unsupported = 1; }
+def : WriteRes<WriteFMul, []>    { let Unsupported = 1; }
+def : WriteRes<WriteFDiv, []>    { let Unsupported = 1; }
+def : WriteRes<WriteVd, []>      { let Unsupported = 1; }
+def : WriteRes<WriteVq, []>      { let Unsupported = 1; }
+def : WriteRes<WriteVLD, []>     { let Unsupported = 1; }
+def : WriteRes<WriteVST, []>     { let Unsupported = 1; }
+def : WriteRes<WriteSys, []>     { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint, []>    { let Unsupported = 1; }
+def : WriteRes<WriteLDHi, []>    { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []>  { let Unsupported = 1; }
+
+// These ReadAdvance entries will be defined in later implementation
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+def : ReadAdvance<ReadST,      0>;
+
+
+//IXU resource definition
+// 1 cycles NO pipe
+def ORYONWrite_1Cyc_NONE : SchedWriteRes<[]>;
+
+// 1 cycles on I01.
+def ORYONWrite_1Cyc_I01 : SchedWriteRes<[ORYONI01]>;
+
+def ORYONWrite_1Cyc_2Uops_I01 : SchedWriteRes<[ORYONI01]> {
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_I0 : SchedWriteRes<[ORYONI0]>;
+
+// 7 cycles on I2. PAC*/AUT* instructions
+def ORYONWrite_7Cyc_I2 : SchedWriteRes<[ORYONI2]> {
+  let Latency = 7;
+}
+
+// 7 cycles on I2. PAC*/AUT* instructions
+def ORYONWrite_7Cyc_3Uops_I2 : SchedWriteRes<[ORYONI2]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+
+// 9 (7+1+1) cycles on I2 and I0/I1, I0. Authentication branch instructions
+// these instructions are broken down to three uops
+// a.	PtrAuth on pipe 2 taking 7 cycles
+// b.	Link Register Update on pipes 0 and 1 taking 1 cycle
+// c.	Indirect branch on pipe 0 taking 1 cycle
+
+def ORYONWrite_9Cyc_I012 : SchedWriteRes<[ORYONI2, ORYONI01]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+}
+
+// 3 cycles on I2. CRC32 and CRC32C instructions
+def ORYONWrite_3Cyc_I2 : SchedWriteRes<[ORYONI2]> {
+  let Latency = 3;
+}
+
+// 1 cycle on I012345
+def ORYONWrite_1Cyc_I012345 : SchedWriteRes<[ORYONI012345]>;
+
+// 1 cycle on I0123
+def ORYONWrite_1Cyc_I0123 : SchedWriteRes<[ORYONI0123]>;
+
+// 1 cycle on 2 of I012345
+def ORYONWrite_1Cyc_I012345_I012345 :
+SchedWriteRes<[ORYONI012345, ORYONI012345]> ;
+
+// 2 cycle on 2 of I0123 with ReleaseAtCycles
+def ORYONWrite_2Cyc_I0123_I0123_RC :
+SchedWriteRes<[ORYONI0123, ORYONI0123]> {
+  let Latency = 2;
+  let ReleaseAtCycles = [2,2];
+}
+
+// 2 cycle on 2 of I012345
+def ORYONWrite_2Cyc_I012345_I012345_RC :
+SchedWriteRes<[ORYONI012345, ORYONI012345]> {
+  let Latency = 2;
+  let ReleaseAtCycles = [2,2];
+}
+
+// 3 cycle on 2 of I45
+def ORYONWrite_3Cyc_I45_I45_RC :
+SchedWriteRes<[ORYONI45, ORYONI45]> {
+  let Latency = 3;
+  let ReleaseAtCycles = [2,2];
+}
+
+// 3 cycle on I45
+def ORYONWrite_3Cyc_I45 : SchedWriteRes<[ORYONI45]> {
+  let Latency = 3;
+}
+
+// 7 cycle on I2 32-bit integer division
+def ORYONWrite_7Cyc_I2_RC : SchedWriteRes<[ORYONI2]> {
+  let Latency = 7;
+  let ReleaseAtCycles = [2];
+}
+
+// 9 cycle on I2 64-bit integer division
+def ORYONWrite_9Cyc_I2_RC : SchedWriteRes<[ORYONI2]> {
+  let Latency = 9;
+  let ReleaseAtCycles = [2];
+}
+
+// LSU resource definition
+// need to define WriteLDAdr, WriteAdrAdr, WriteLDHi, WriteSTX
+// 4 cycle on LS(P6789)
+def ORYONWrite_4Cyc_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 4;
+}
+
+// 4 cycle for Post/Pre inc/dec access, also covers all pair loads Post/Pre
+def ORYONWrite_4Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 4;
+}
+
+// 5 (4+1) for VXU SIMD access/could also include FP
+// resource might not be correct, as VXU resource not included
+def ORYONWrite_5Cyc_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+}
+
+def ORYONWrite_5Cyc_2Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_5Cyc_3Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_5Cyc_4Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+}
+
+def ORYONWrite_5Cyc_5Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+}
+
+def ORYONWrite_5Cyc_6Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 6;
+}
+
+def ORYONWrite_5Cyc_8Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 8;
+}
+
+def ORYONWrite_5Cyc_10Uops_LD : SchedWriteRes<[ORYONLD]> {
+  let Latency = 5;
+  let NumMicroOps = 10;
+}
+
+// 6 cycle for Post/Pre inc/dec access
+def ORYONWrite_5Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+}
+
+def ORYONWrite_5Cyc_2Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_5Cyc_3Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_5Cyc_4Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 4;
+}
+
+def ORYONWrite_5Cyc_5Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 5;
+}
+
+def ORYONWrite_5Cyc_6Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 6;
+}
+
+def ORYONWrite_5Cyc_8Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 8;
+}
+
+def ORYONWrite_5Cyc_10Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> {
+  let Latency = 5;
+  let NumMicroOps = 10;
+}
+
+// 1 cycle for all generic stores
+def ORYONWrite_1Cyc_ST : SchedWriteRes<[ORYONST]>;
+
+def ORYONWrite_1Cyc_2Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_3Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_1Cyc_4Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 4;
+}
+
+def ORYONWrite_1Cyc_5Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 5;
+}
+
+def ORYONWrite_1Cyc_6Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 6;
+}
+
+def ORYONWrite_1Cyc_8Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 8;
+}
+
+def ORYONWrite_1Cyc_10Uops_ST : SchedWriteRes<[ORYONST]> {
+  let NumMicroOps = 10;
+}
+
+// 1 cycle for neon write: float + ASIMD with Post/Pre Inc/Dec access
+// also includes Pair store until further informed
+def ORYONWrite_1Cyc_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_1Cyc_2Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_1Cyc_3Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_1Cyc_4Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 4;
+}
+
+def ORYONWrite_1Cyc_5Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 5;
+}
+
+def ORYONWrite_1Cyc_6Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 6;
+}
+
+def ORYONWrite_1Cyc_8Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 8;
+}
+
+def ORYONWrite_1Cyc_10Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> {
+  let NumMicroOps = 10;
+}
+
+// VXU resource definition
+
+// I2V instruction has 1 uOp
+// I2v with convert has 2 uOps
+// all I2V, V2I's throughputs are 2
+// On VXU doc, p37 -- latencies and throughput
+// P41, resource taken, P42, uOps
+def ORYONWrite_I2V_4Cyc_I45 : SchedWriteRes<[ORYONI2V]> {
+  let Latency = 4;
+}
+
+// inline a FCVT, so add one more uOp
+def ORYONWrite_I2V_7Cyc_I45 : SchedWriteRes<[ORYONI2V]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+// V2I move instruction has 1/2 uOps, P42 in VXU doc
+// Latency is 3, FCVT is also 3 cycle
+// move + convert is 6 (3+3) cycles
+// throughput is 2
+def ORYONWrite_V2I_3Cyc_FP01 : SchedWriteRes<[ORYONV2I]> {
+  let Latency = 3;
+}
+
+// inline a FCVT, so add one more uOp
+def ORYONWrite_V2I_6Cyc_FP01 : SchedWriteRes<[ORYONV2I]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_V2V_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 2;
+}
+
+def ORYONWrite_V2V_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 3;
+}
+
+def ORYONWrite_V2V_6Cyc_FP01 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+def ORYONWrite_4Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 4;
+}
+
+def ORYONWrite_3Cyc_FP0 : SchedWriteRes<[ORYONFP0]> {
+  let Latency = 3;
+}
+
+def ORYONWrite_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 3;
+}
+
+def ORYONWrite_3Cyc_2Uops_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def ORYONWrite_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 2;
+}
+
+def ORYONWrite_2Cyc_FP01 : SchedWriteRes<[ORYONFP01]> {
+  let Latency = 2;
+}
+
+// 2 cycle on FP1
+def ORYONWrite_2Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+  let Latency = 2;
+}
+
+// 3 cycle on FP1
+def ORYONWrite_3Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+  let Latency = 3;
+}
+
+// 4 cycle , 0.5 throughput on FP1
+def ORYONWrite_4Cyc_FP1_RC4 : SchedWriteRes<[ORYONFP1]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [4];
+}
+
+// 5 cycle , 1 throughput on FP1
+def ORYONWrite_5Cyc_FP1 : SchedWriteRes<[ORYONFP1]> {
+  let Latency = 5;
+}
+
+// 8 cycle , 2 throughput on FP0123
+def ORYONWrite_8Cyc_FP0123_RC : SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 8;
+  let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_6Cyc_FP3 : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 6;
+}
+
+def ORYONWrite_7Cyc_FP3 : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 7;
+}
+
+def ORYONWrite_8Cyc_FP3 : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 8;
+}
+
+def ORYONWrite_9Cyc_FP3 : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 9;
+}
+
+def ORYONWrite_10Cyc_FP3 : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 10;
+}
+
+def ORYONWrite_8Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 8;
+  let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_10Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 10;
+  let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_13Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> {
+  let Latency = 13;
+  let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_4Cyc_FP0123_RC :
+SchedWriteRes<[ORYONFP0123]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [2];
+}
+
+def ORYONWrite_4Cyc_FP0123_FP0123_RC :
+SchedWriteRes<[ORYONFP0123, ORYONFP0123]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ReleaseAtCycles = [2,2];
+}
+
+def ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC :
+SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ReleaseAtCycles = [3,3,3];
+}
+
+def ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC :
+SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123, ORYONFP0123]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+  let ReleaseAtCycles = [6,6,6,6];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in IXU
+//===----------------------------------------------------------------------===//
+
+//---
+// Arithmetic Instructions
+//---
+
+//1, 1, 6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+            (instregex "^ADD(W|X)r(i|r|x)", "^SUB(W|X)r(i|r|x)")>;
+
+//2,2,3
+def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC],
+            (instregex "^ADD(W|X)rs", "^SUB(W|X)rs")>;
+
+//1,1,4 alias CMP, CMN on page 75
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^ADDS(W|X)r(i|r|x)(64)?", "^SUBS(W|X)r(i|r|x)")>;
+
+//2,2,2 alias CMP, CMN on page 75
+def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC],
+            (instregex "^ADDS(W|X)rs", "^SUBS(W|X)rs")>;
+
+//1,1,4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^ADC(W|X)r","^SBC(W|X)r",
+                       "^ADCS(W|X)r","^SBCS(W|X)r")>;
+
+//1,1,2
+def : InstRW<[ORYONWrite_1Cyc_2Uops_I01],
+            (instrs ADR,ADRP)>;
+
+//1,1,4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^CSEL(W|X)r", "^CSINV(W|X)r",
+                       "^CSNEG(W|X)r", "^CSINC(W|X)r")>;
+
+//---
+//Compare Instruciton
+//---
+
+// We have CCMP, CCMN as LLVM DAG node
+// CMP is an alias of SUBS as above
+// CMN is an alias of ADDS as above
+// We also have no way to get shift compare node in LLVM
+//2,2,1.5 CMP, CMN
+
+//1,1,4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^CCMP(W|X)(i|r)", "^CCMN(W|X)(i|r)")>;
+
+//---
+// Branch
+//---
+
+def : InstRW<[ORYONWrite_1Cyc_NONE], (instrs B)>;
+def : InstRW<[ORYONWrite_1Cyc_I01], (instrs BL)>;
+def : InstRW<[ORYONWrite_1Cyc_I01],
+            (instrs Bcc, CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+def : InstRW<[ORYONWrite_1Cyc_I0], (instrs BR, BLR)>;
+def : InstRW<[ORYONWrite_1Cyc_I0], (instrs RET)>;
+
+// 3 uOp, 1 cycle for branch, 7 cycle for Authentication,
+// 1 cycle for updating link register
+// V8.3a PAC
+def : InstRW<[ORYONWrite_9Cyc_I012],
+            (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ,
+                    BRAA, BRAAZ, BRAB, BRABZ)>;
+def : InstRW<[ORYONWrite_9Cyc_I012], (instrs RETAA, RETAB, ERETAA, ERETAB)>;
+
+def : InstRW<[ORYONWrite_7Cyc_3Uops_I2], (instregex "^LDRAA", "^LDRAB")>;
+
+// Logical Instructions
+//---
+
+//1,1,4 TST is an alias of ANDS
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^ANDS(W|X)r(i|r|x)", "^BICS(W|X)r(i|r|x)")>;
+
+//2,2,2 TST shift is an alias
+def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC],
+            (instregex "^ANDS(W|X)rs", "^BICS(W|X)rs")>;
+
+//1,1,6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+            (instregex "^AND(W|X)r(i|r|x)", "^EOR(W|X)r(i|r|x)",
+                       "^ORR(W|X)r(i|r|x)", "^BIC(W|X)r(i|r|x)",
+                       "^EON(W|X)r(i|r|x)", "^ORN(W|X)r(i|r|x)")>;
+
+//2,2,3
+def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC],
+            (instregex "^AND(W|X)rs", "^EOR(W|X)rs", "^ORR(W|X)rs",
+                       "^BIC(W|X)rs", "^EON(W|X)rs", "^ORN(W|X)rs")>;
+
+
+//---
+// Shift Instructions
+//---
+
+//1,1,6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+            (instregex "^ASRV(W|X)r", "^LSLV(W|X)r",
+                       "^LSRV(W|X)r", "^RORV(W|X)r",
+                       "RMIF")>;
+
+//---
+// Move-Data Bit-field and Sign_Extension Instructions
+//---
+
+//1,1,6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+            (instregex "^MOVK(W|X)i", "^MOVN(W|X)i",
+                       "^MOVZ(W|X)i", "^SBFM(W|X)ri",
+                       "^UBFM(W|X)ri", "^BFM(W|X)ri",
+                       "^SXT(W|B|H|X)", "^UXT(H|B)")>;
+
+// COPY instruction is an LLVM internal DAG node, needs further study
+def : InstRW<[ORYONWrite_1Cyc_I012345], (instrs COPY)>;
+
+//---
+// Reverse Instructions
+//---
+
+//1,1,6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+            (instregex "^RBIT(W|X)r", "^REV(16|32|64)?(W|X)r")>;
+
+
+//---
+// Flag Manipulate Instructions
+//---
+
+//1,1,4
+def : InstRW<[ORYONWrite_1Cyc_I0123],
+            (instregex "^SETF8", "^SETF16", "^CFINV")>;
+
+//---
+// Miscellaneous Instructions
+//---
+
+//1,1,6
+def : InstRW<[ORYONWrite_1Cyc_I012345],
+              (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$", "^EXTR(W|X)rri")>;
+
+
+//---
+// Multiply Instructions
+//---
+
+//1,3,2
+def : InstRW<[ORYONWrite_3Cyc_I45],
+            (instregex "^MADD(W|X)rrr", "^MSUB(W|X)rrr",
+                       "^(S|U)MADDLrrr", "^(S|U)MSUBLrrr",
+                       "^(S|U)MULHrr")>;
+
+//---
+// Divide Instructions
+//---
+
+def : InstRW<[ORYONWrite_7Cyc_I2_RC],
+             (instregex "^(S|U)DIVWr")>;
+
+def : InstRW<[ORYONWrite_9Cyc_I2_RC],
+             (instregex "^(S|U)DIVXr")>;
+
+
+//---
+// Cryptgraphy Instructions
+//
+//1,3,1  on I2
+def : InstRW<[ORYONWrite_3Cyc_I2],
+            (instregex "^CRC32(B|H|W|X)rr", "^CRC32C(B|H|W|X)rr")>;
+
+//---
+// PAU instructions
+//---
+
+// on p47 of IXU document, we have 7 cycles for all PAU instructions
+// here we just assume all signing and pauth instructions are 7 cycles
+// assume all are 7 cycles here
+
+// signing instrucitons
+def : InstRW<[ORYONWrite_7Cyc_I2], (instrs PACIA, PACIB,
+                                            PACDA, PACDB,
+                                            PACIZA, PACIZB,
+                                            PACDZA, PACDZB,
+                                            PACGA)>;
+// authentication instrucitons
+def : InstRW<[ORYONWrite_7Cyc_I2], (instrs AUTIA, AUTIB,
+                                            AUTDA, AUTDB,
+                                            AUTIZA, AUTIZB,
+                                            AUTDZA, AUTDZB)>;
+def : InstRW<[ORYONWrite_7Cyc_I2], (instrs XPACI, XPACD)>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in LSU
+//===----------------------------------------------------------------------===//
+
+// 4 cycle Load-to-use from L1D$
+// Neon load with 5 cycle
+// 6 cycle to STA ?
+// STD cycle ?
+// NEON STD + 2
+
+// Load Instructions
+// FP Load Instructions
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPDi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPQi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPSi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPXi)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPDi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPQi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPXi)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBui)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDui)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHui)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQui)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSui)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDl)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQl)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWl)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXl)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRBi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRHi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRXi)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBXi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHXi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSWi)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPDpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPQpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPSpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPWpre)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpre)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpre)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPDpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPQpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPSpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPWpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345],
+            (instrs LDPXpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpost)>;
+def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpost)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroW)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroW)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroX)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroX)>;
+
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBBi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURDi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHHi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURQi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURXi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBXi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHWi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHXi)>;
+def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSWi)>;
+
+
+
+// Store register, immed post-index
+// NOTE: Handled by WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteST
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-indx, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteSTP.
+
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBBi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURDi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHHi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURQi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURSi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURWi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURXi)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRBi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRHi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRWi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRXi)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPDi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPQi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPXi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPWi)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPDi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPQi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPXi)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPWi)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRBui)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRDui)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRHui)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRQui)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRXui)>;
+def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRWui)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STPDpre, STPDpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STPSpre, STPSpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STPWpre, STPWpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRBpre, STRBpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRDpre, STRDpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRHpre, STRHpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRQpre, STRQpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRSpre, STRSpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRWpre, STRWpost)>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRBroW, STRBroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRDroW, STRDroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRHroW, STRHroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRQroW, STRQroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRSroW, STRSroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRWroW, STRWroX)>;
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instrs STRXroW, STRXroX)>;
+
+// ASIMD Load instructions, 4 cycle access + 2 cycle NEON access
+// ASIMD load, 1 element, multiple, 1 reg, D-form 1uOps
+// ASIMD load, 1 element, multiple, 1 reg, Q-form 1uOps
+def : InstRW<[ORYONWrite_5Cyc_LD],
+            (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_LD_I012345],
+            (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form 3 uOps
+// ASIMD load, 1 element, multiple, 2 reg, Q-form 2 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+            (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD],
+            (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+            (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+            (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form 4 uOps
+// ASIMD load, 1 element, multiple, 3 reg, Q-form 3 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+            (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+            (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+            (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+            (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form 6 uOps
+// ASIMD load, 1 element, multiple, 4 reg, Q-form 4 uOps
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+            (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+            (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+            (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+            (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S 2uOps
+// ASIMD load, 1 element, one lane, D     2UOps
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+            (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S 2uOps
+// ASIMD load, 1 element, all lanes, D-form, D     2uOps
+// ASIMD load, 1 element, all lanes, Q-form        2uOps
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD],
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345],
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S 3 uOps
+// ASIMD load, 2 element, multiple, Q-form, D     4 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+            (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+            (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+            (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+            (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H           3 uOps
+// ASIMD load, 2 element, one lane, S             3 uOps
+// ASIMD load, 2 element, one lane, D             3 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+            (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S 3 uOps
+// ASIMD load, 2 element, all lanes, D-form, D     3 uOps
+// ASIMD load, 2 element, all lanes, Q-form        3 uOps
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD],
+            (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345],
+            (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S  5 uOps
+// ASIMD load, 3 element, multiple, Q-form, B/H/S  6 uOps
+// ASIMD load, 3 element, multiple, Q-form, D      6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+            (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+            (instregex "^LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+            (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+            (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lone, B/H            4 uOps
+// ASIMD load, 3 element, one lane, S              4 uOps
+// ASIMD load, 3 element, one lane, D              5 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD3i(64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+            (instregex "^LD3i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+            (instregex "^LD3i(64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S 4 uOps
+// ASIMD load, 3 element, all lanes, D-form, D     5 uOps
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S 4 uOps
+// ASIMD load, 3 element, all lanes, Q-form, D     5 uOps
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD],
+            (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+            (instregex "^LD3Rv(1d|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345],
+            (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+            (instregex "^LD3Rv(1d|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S  6 uOps
+// ASIMD load, 4 element, multiple, Q-form, B/H/S  10 uOps
+// ASIMD load, 4 element, multiple, Q-form, D      8 uOps
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+            (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_10Uops_LD],
+            (instregex "^LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_8Uops_LD],
+            (instregex "^LD4Fourv(2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+            (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_10Uops_LD_I012345],
+            (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_8Uops_LD_I012345],
+            (instregex "^LD4Fourv(2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H            5 uOps
+// ASIMD load, 4 element, one lane, S              5 uOps
+// ASIMD load, 4 element, one lane, D              6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], (instregex "^LD4i(64)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+            (instregex "^LD4i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+            (instregex "^LD4i(64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S    5 uOps
+// ASIMD load, 4 element, all lanes, D-form, D        6 uOps
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S    5 uOps
+// ASIMD load, 4 element, all lanes, Q-form, D        6 uOps
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD],
+            (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD],
+            (instregex "^LD4Rv(1d|2d)$")>;
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345],
+            (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345],
+            (instregex "^LD4Rv(1d|2d)_POST$")>;
+
+// ASIMD Store Instructions
+// ASIMD store, 1 element, multiple, 1 reg, D-form    1 uOps
+// ASIMD store, 1 element, multiple, 1 reg, Q-form    1 uops
+def : InstRW<[ORYONWrite_1Cyc_ST],
+            (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_ST_I012345],
+            (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form    2 uOps
+// ASIMD store, 1 element, multiple, 2 reg, Q-form    2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+            (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+            (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form    3 uOps
+// ASIMD store, 1 element, multiple, 3 reg, Q-form    3 uOps
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST],
+            (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345],
+            (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form    4 uOps
+// ASIMD store, 1 element, multiple, 4 reg, Q-form    4 uOps
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+            (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+            (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S            2 uOps
+// ASIMD store, 1 element, one lane, D                2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+            (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+            (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S    2 uOps
+// ASIMD store, 2 element, multiple, Q-form, B/H/S    4 uOps
+// ASIMD store, 2 element, multiple, Q-form, D        4 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+            (instregex "^ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+            (instregex "^ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+            (instregex "^ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+            (instregex "^ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S            2 uOps
+// ASIMD store, 2 element, one lane, D                2 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST],
+            (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+            (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S    4 uOps
+// ASIMD store, 3 element, multiple, Q-form, B/H/S    6 uOps
+// ASIMD store, 3 element, multiple, Q-form, D        6 uOps
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST],
+            (instregex "^ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_6Uops_ST],
+            (instregex "^ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+            (instregex "^ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_6Uops_ST_I012345],
+            (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H              2 uOps
+// ASIMD store, 3 element, one lane, S                2 uOps
+// ASIMD store, 3 element, one lane, D                4 uOps
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], (instregex "^ST3i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST3i(64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345],
+            (instregex "^ST3i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+            (instregex "^ST3i(64)_POST$")>;
+
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S    5 uOps
+// ASIMD store, 4 element, multiple, Q-form, B/H/S    10 uOps
+// ASIMD store, 4 element, multiple, Q-form, D        8 uOps
+def : InstRW<[ORYONWrite_1Cyc_5Uops_ST],
+            (instregex "^ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_10Uops_ST],
+            (instregex "^ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[ORYONWrite_1Cyc_8Uops_ST],
+            (instregex "^ST4Fourv(2d)$")>;
+def : InstRW<[ORYONWrite_1Cyc_5Uops_ST_I012345],
+            (instregex "^ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_10Uops_ST_I012345],
+            (instregex "^ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_8Uops_ST_I012345],
+            (instregex "^ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H              3 uOps
+// ASIMD store, 4 element, one lane, S                3 uOps
+// ASIMD store, 4 element, one lane, D                4 uOps
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], (instregex "^ST4i(8|16|32)$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST4i(64)$")>;
+def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345],
+            (instregex "^ST4i(8|16|32)_POST$")>;
+def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345],
+            (instregex "^ST4i(64)_POST$")>;
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Tables in VXU
+//===----------------------------------------------------------------------===//
+// all uOps are not clearly written in the VXU document
+
+// I2V
+def : InstRW<[ORYONWrite_I2V_4Cyc_I45], (instregex "^FMOV[HSD][WX]r", "^FMOVDXHighr")>;
+
+// I2V with convert
+def : InstRW<[ORYONWrite_I2V_7Cyc_I45], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>;
+
+// V2I
+def : InstRW<[ORYONWrite_V2I_3Cyc_FP01], (instregex "^FMOV[WX][HSD]r", "FMOVXDHighr")>;
+
+// V2I with convert 2nd [SU] necessary?
+def : InstRW<[ORYONWrite_V2I_6Cyc_FP01], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+
+// float to float move immediate, row 7 in big chart
+def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]r")>;
+def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]i")>;
+
+// float to float conversion within VXU, precision conversion
+def : InstRW<[ORYONWrite_V2V_6Cyc_FP01], (instregex "^FJCVTZS")>;
+def : InstRW<[ORYONWrite_V2V_3Cyc_FP0123], (instregex "^FCVT[HSD][HSD]r",
+                                                       "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// floating comparison write to NZCV
+def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCMP(E)?[HSD]r[ir]")>;
+def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCCMP(E)?[HSD]rr")>;
+
+// floating point conditional select
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FCSEL")>;
+
+// floating multiply-add
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^(F|FN)MADD", "^(F|FN)MSUB")>;
+
+// floating unary, cycle/throughput? xls row14
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^F(ABS|NEG)[SD]r")>;
+
+//floating division/square root
+def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVHrr")>;
+def : InstRW<[ORYONWrite_8Cyc_FP3], (instregex "^FDIVSrr")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3], (instregex "^FDIVDrr")>;
+
+def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTHr")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTSr")>;
+def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTDr")>;
+
+//==========
+// SIMD move instructions
+//==========
+
+// ASIMD DUP element
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^DUPv.+lane")>;
+// ASIMD DUP general thoughput undecided, 3? FP0123
+// VXU doc, p42, 2 uOps
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^DUPv.+gpr")>;
+
+// ASIMD insert, element to element
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^INSv.+lane")>;
+// ASIMD insert,  gen reg 3? FP0123?
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^INSv.+gpr")>;
+
+// ASIMD move, FP immed
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^[SU]MOVv")>;
+
+//==========
+// SIMD arithmetic instructions
+//==========
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDv", "^SUBv",
+                                         "^BIFv", "^BITv", "^BSLv",
+                                         "^ANDv", "^BICv", "^EORv",
+                                         "^ORRv", "^ORNv")>;
+
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// floating division
+def : InstRW<[ORYONWrite_6Cyc_FP3], (instregex "^FDIVv.*16$")>;
+def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVv.*32$")>;
+def : InstRW<[ORYONWrite_9Cyc_FP3], (instregex "^FDIVv.*64$")>;
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMUL(X)?v",
+                                                   "^FRECPSv", "^FRSQRTSv")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^MLAv","^MLSv", "^MULv",
+                                                   "^PMULv", "UABAv")>;
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "SABAv", "SABDv",
+                                                   "^(SH|UH)(ADD|SUB)v",
+                                                   "^S(MAX|MIN)v",
+                                                   "^(SQ|UQ)(ADD|SUB)v",
+                                                   "^(SQ|SQR|UQ|UQR)SHLv",
+                                                   "^(SR|UR)HADDv",
+                                                   "^(SR|UR)SHLv",
+                                                   "^UABDv",
+                                                   "^U(MAX|MIN)v")>;
+// IMAX or UMAX in the above line
+//==========
+// SIMD compare instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^CMEQv","^CMGEv","^CMGTv",
+                                                   "^CMLEv","^CMLTv", "^CMHIv",
+                                                   "^CMHSv",
+                                                   "^FCMEQv", "^FCMGEv",
+                                                   "^FCMGTv", "^FCMLEv",
+                                                   "^FCMLTv",
+                                                   "^FACGEv", "^FACGTv")>;
+
+//==========
+// SIMD widening and narrowing arithmetic instructions
+//==========
+// NO need to list ADDHN2, RADDHN2, RSUBHN2 as they are not distinguished
+// from ADDHN, RADDHN, RSUBHN in td file(v16i8, v8i16, v4i32).
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDHNv",
+                                                   "^SUBHNv",
+                                                   "^RADDHNv",
+                                                   "^RSUBHNv",
+                                                   "^SABD(L|L2)v", "^UABD(L|L2)v",
+                                                   "^(S|U)(ADD|SUB)(L|L2|W|W2)v")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^PMUL(L|L2)v","^SABA(L|L2)v",
+                                                   "^(S|U|SQ)(MLA|MSL|MUL)(L|L2)v")>;
+
+//==========
+// SIMD unary arithmetic instructions
+//==========
+//^MVNv is an alias of ^NOTv
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ABSv", "^CLSv","^CLZv", "^CNTv",
+                                                   "^NEGv", "^NOTv",
+                                                   "^RBITv", "^REV(16|32|64)v",
+                                                   "^SQ(ABS|NEG)v", "^SQ(XT|XTU)(N|N2)v",
+                                                   "^(SU|US)QADDv",
+                                                   "^UQXT(N|N2)v", "^XTN2?v")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FCVT(L|L2|N|N2|XN|XN2)v",
+                                                   "^FRINT[AIMNPXZ]v",
+                                                   "^FRSQRTEv",
+                                                   "^(S|U)ADALPv",
+                                                   "^(S|U)ADDLPv")>;
+
+
+def : InstRW<[ORYONWrite_3Cyc_FP0], (instregex "^URECPEv", "^URSQRTEv",
+                                                "^FRECPEv", "^FRECPXv")>;
+
+def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTv.*16$")>;
+def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTv.*32$")>;
+def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTv.*64$")>;
+
+//==========
+// SIMD binary elememt arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMLAv", "^FMLSv")>;
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex  "^SQDMULHv",
+                                                   "^SQRD(MLA|MLS|MUL)Hv")>;
+
+//==========
+// SIMD permute instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^EXTv", "^TRN(1|2)v",
+                                                   "^UZP(1|2)v", "^ZIP(1|2)v")>;
+
+//==========
+// SIMD immediate instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex  "^MOVIv", "^MVNIv")>;
+
+//==========
+// SIMD shift(immediate) instructions
+//==========
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^RSHR(N|N2)v", "^SHLv",
+                                                   "^(SHL|SHR)(N|N2)v",
+                                                   "^SLIv",
+                                                   "^(SQ|SQR)SHR(U)?(N|N2)v",
+                                                   "^(UQ|UQR)SHR(N|N2)v",
+                                                   "^SQSHLUv",
+                                                   "^SRIv",
+                                                   "^(S|SR|U|UR)SHRv",
+                                                   "^(S|SR|U|UR)SRAv",
+                                                   "^(S|U)SHL(L|L2)v")>;
+
+//==========
+// SIMD floating-point and integer conversion instructions
+//==========
+// same as above conversion
+
+//==========
+// SIMD reduce (acoss vector lanes) instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDVv",
+                                                   "^(FMAX|FMIN)(V|NMV)v",
+                                                   "^(S|U)ADDLVv",
+                                                   "^(S|U)(MAX|MIN)Vv")>;
+//==========
+// SIMD pairwise arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDPv", "^FADDPv",
+                                                   "^(FMAX|FMIN)(NMP|P)v",
+                                                   "^(S|U)(MIN|MAX)Pv")>;
+//==========
+// SIMD dot prodcut instructions
+//==========
+
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(U|S)DOTv")>;
+
+//==========
+// SIMD table lookup instructions
+//==========
+// TBL 1-reg/2-reg; TBX 1-reg, 1uOp, throughput=4 latency=2
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instrs TBLv8i8One, TBLv16i8One,
+                                                TBXv8i8One, TBXv16i8One,
+                                                TBLv8i8Two, TBLv16i8Two)>;
+
+// TBL 3-reg/4-reg, 3uops, throughtput=4/3=1.33 latency=4
+def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC],
+            (instrs TBLv8i8Three, TBLv16i8Three,
+                    TBLv8i8Four, TBLv16i8Four)>;
+
+
+// TBX 2-reg 2 uOps, throughput=2 latency=4
+def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_RC], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// TBX 3-reg/4-reg, 4uOps, throughput=1, latency=6
+def : InstRW<[ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC],
+            (instrs TBXv8i8Three, TBXv16i8Three,
+                    TBXv8i8Four, TBXv16i8Four)>;
+
+
+//==========
+// SIMD complex number arithmetic instructions
+//==========
+
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FCADDv", "^FCMLAv")>;
+
+//==========
+// SIMD cryptographic instructions
+//==========
+// 3,4 on IMLA, CRYP
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^AES[DE]",
+                                                   "^SM3(TT1|TT2)(A|B)")>;
+
+// 2,4 on CRYP
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^AESI?MC",
+                                                   "^EOR3",
+                                                   "^RAX1",
+                                                   "^XAR",
+                                                   "^BCAX",
+                                                   "^SM3SS1",
+                                                   "^SM3PART(W1|W2)")>;
+// 5,1 on CRYP
+def : InstRW<[ORYONWrite_5Cyc_FP1], (instregex "^SM4E",
+                                                "^SM4EKEY")>;
+
+// 2,1 on CRYP
+def : InstRW<[ORYONWrite_2Cyc_FP1], (instregex "^SHA1(H|SU0|SU1)",
+                                                "^SHA256SU0",
+                                                "^SHA512(SU0|SU1)")>;
+
+// 3,1 on CRYP
+def : InstRW<[ORYONWrite_3Cyc_FP1], (instregex "^SHA256SU1",
+                                                "^SHA512(H|H2)")>;
+
+// 4,0.25 on CRYP
+def : InstRW<[ORYONWrite_4Cyc_FP1_RC4], (instregex "^SHA1(C|P|M)",
+                                                "^SHA256(H|H2)")>;
+
+//==========
+// SIMD v8.6 instructions
+//==========
+// 4,2 on IMLA
+def : InstRW<[ORYONWrite_4Cyc_FP0123_RC], (instregex "^(S|U|US)MMLA$")>;
+
+// 4,0.5 on IMLA
+def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMMLA$")>;
+
+// 4,0.5 on IMLA
+def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMLAL(B|T)")>;
+
+// 3,4
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(US|SU)DOTv")>;
+
+// 3,1
+def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^BF(16)?DOTv")>;
+
+// 3,4
+def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^BFCVT(N|N2)?$")>;
+
+
+} // SchedModel = OryonModel
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 8bc26eeef34d..93ea729e2550 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -299,6 +299,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     PrefLoopAlignment = Align(64);
     MaxInterleaveFactor = 4;
     break;
+  case Oryon:
+    CacheLineSize = 64;
+    PrefFunctionAlignment = Align(16);
+    MaxInterleaveFactor = 4;
+    PrefetchDistance = 128;
+    MinPrefetchStride = 1024;
+    break;
   }
 
   if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f49c73dc7951..9f5756fc7e40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                            cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
+                                      cl::init(true), cl::Hidden);
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
     return true;
   return BaseT::shouldTreatInstructionLikeSelect(I);
 }
+
+bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                                   const TargetTransformInfo::LSRCost &C2) {
+  // AArch64 specific here is adding the number of instructions to the
+  // comparison (though not as the first consideration, as some targets do)
+  // along with changing the priority of the base additions.
+  // TODO: Maybe a more nuanced tradeoff between instruction count
+  // and number of registers? To be investigated at a later date.
+  if (EnableLSRCostOpt)
+    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
+                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
+                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+
+  return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+}
+\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 2f44aaa3e26a..feec1a4289c3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -425,6 +425,9 @@ public:
   }
 
   std::optional<unsigned> getMinPageSize() const { return 4096; }
+
+  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                     const TargetTransformInfo::LSRCost &C2);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8e302786c746..d0d7a9dc1724 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1534,6 +1534,12 @@ def FeatureISAVersion11_5_1 : FeatureSet<
      FeatureVGPRSingleUseHintInsts,
      Feature1_5xVGPRs])>;
 
+def FeatureISAVersion11_5_2 : FeatureSet<
+  !listconcat(FeatureISAVersion11_Common.Features,
+    [FeatureSALUFloatInsts,
+     FeatureDPPSrc1SGPR,
+     FeatureVGPRSingleUseHintInsts])>;
+
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
    FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 625ac0230f16..2bdbf4151dd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -1017,7 +1017,7 @@ public:
       //
       // TODO: We could filter out subgraphs that do not access LDS globals.
       for (Function *F : KernelsThatAllocateTableLDS)
-        removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
+        removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"});
     }
 
     DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9c94ca1e4708..17c961578382 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/Scalar.h"
@@ -992,6 +993,10 @@ void AMDGPUPassConfig::addIRPasses() {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  // This can be disabled by passing ::Disable here or on the command line
+  // with --expand-variadics-override=disable.
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 2ada981a77cd..d218ffeb1fec 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -295,7 +295,11 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
   FeatureISAVersion11_5_1.Features
 >;
 
-// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151]
+def : ProcessorModel<"gfx1152", GFX11SpeedModel,
+  FeatureISAVersion11_5_2.Features
+>;
+
+// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152]
 def : ProcessorModel<"gfx11-generic", GFX11SpeedModel,
   FeatureISAVersion11_Generic.Features
 >;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index d7d6e00d2389..e805e964ffe4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -113,6 +113,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:     AK = GK_GFX9_GENERIC; break;
@@ -196,6 +197,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103;
   case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150;
   case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
+  case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152;
   case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
   case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
   case GK_GFX9_GENERIC:     return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c47eea20563d..8b42d4a1dee7 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2052,9 +2052,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
     MemInfoMap &Visited,
     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
 
-  if (!(MI.mayLoad() ^ MI.mayStore()))
-    return false;
-
   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
     return false;
 
@@ -2065,10 +2062,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                               : AMDGPUAS::FLAT_ADDRESS;
 
-  if (MI.mayLoad() &&
-      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
-    return false;
-
   if (AnchorList.count(&MI))
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe2..5dc3457b5bfa 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ private:
 
   MachineBasicBlock *emitEndCf(MachineInstr &MI);
 
-  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;
 
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
   return SplitBB;
 }
 
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
-                                       MachineInstr &MI) {
-  MachineFunction &MF = *MBB->getParent();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  bool IsWave32 = ST.isWave32();
-
-  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
-    // This should be before all vector instructions.
-    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
-        .addImm(MI.getOperand(0).getImm());
-    if (LIS) {
-      LIS->RemoveMachineInstrFromMaps(MI);
-      LIS->InsertMachineInstrInMaps(*InitMI);
-    }
-    MI.eraseFromParent();
-    return;
-  }
-
-  // Extract the thread count from an SGPR input and set EXEC accordingly.
-  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-  //
-  // S_BFE_U32 count, input, {shift, 7}
-  // S_BFM_B64 exec, count, 0
-  // S_CMP_EQ_U32 count, 64
-  // S_CMOV_B64 exec, -1
-  Register InputReg = MI.getOperand(0).getReg();
-  MachineInstr *FirstMI = &*MBB->begin();
-  if (InputReg.isVirtual()) {
-    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
-    assert(DefInstr && DefInstr->isCopy());
-    if (DefInstr->getParent() == MBB) {
-      if (DefInstr != FirstMI) {
-        // If the `InputReg` is defined in current block, we also need to
-        // move that instruction to the beginning of the block.
-        DefInstr->removeFromParent();
-        MBB->insert(FirstMI, DefInstr);
-        if (LIS)
-          LIS->handleMove(*DefInstr);
-      } else {
-        // If first instruction is definition then move pointer after it.
-        FirstMI = &*std::next(FirstMI->getIterator());
-      }
-    }
-  }
-
-  // Insert instruction sequence at block beginning (before vector operations).
-  const DebugLoc DL = MI.getDebugLoc();
-  const unsigned WavefrontSize = ST.getWavefrontSize();
-  const unsigned Mask = (WavefrontSize << 1) - 1;
-  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
-                   .addReg(InputReg)
-                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-  if (LV)
-    LV->recomputeForSingleDefVirtReg(InputReg);
-  auto BfmMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
-          .addReg(CountReg)
-          .addImm(0);
-  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
-                   .addReg(CountReg, RegState::Kill)
-                   .addImm(WavefrontSize);
-  if (LV)
-    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
-  auto CmovMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-              Exec)
-          .addImm(-1);
-
-  if (!LIS) {
-    MI.eraseFromParent();
-    return;
-  }
-
-  LIS->RemoveMachineInstrFromMaps(MI);
-  MI.eraseFromParent();
-
-  LIS->InsertMachineInstrInMaps(*BfeMI);
-  LIS->InsertMachineInstrInMaps(*BfmMI);
-  LIS->InsertMachineInstrInMaps(*CmpMI);
-  LIS->InsertMachineInstrInMaps(*CmovMI);
-
-  RecomputeRegs.insert(InputReg);
-  LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
 bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
   for (auto &I : MBB.instrs()) {
     if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         SplitMBB = process(MI);
         Changed = true;
         break;
-
-      // FIXME: find a better place for this
-      case AMDGPU::SI_INIT_EXEC:
-      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
-        lowerInitExec(MBB, MI);
-        if (LIS)
-          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
-        Changed = true;
-        break;
-
-      default:
-        break;
       }
 
       if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f..5b4c44302fa6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ private:
   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
+  SmallVector<MachineInstr *, 4> InitExecInstrs;
 
   void printInfo();
 
@@ -223,6 +224,8 @@ private:
   void lowerLiveMaskQueries();
   void lowerCopyInstrs();
   void lowerKillInstrs(bool IsWQM);
+  void lowerInitExec(MachineInstr &MI);
+  void lowerInitExecInstrs();
 
 public:
   static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                  Opcode == AMDGPU::SI_DEMOTE_I1) {
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
+      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+        InitExecInstrs.push_back(&MI);
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
         // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
   }
 }
 
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+  bool IsWave32 = ST->isWave32();
+
+  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+    // This should be before all vector instructions.
+    MachineInstr *InitMI =
+        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+                Exec)
+            .addImm(MI.getOperand(0).getImm());
+    if (LIS) {
+      LIS->RemoveMachineInstrFromMaps(MI);
+      LIS->InsertMachineInstrInMaps(*InitMI);
+    }
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Extract the thread count from an SGPR input and set EXEC accordingly.
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+  //
+  // S_BFE_U32 count, input, {shift, 7}
+  // S_BFM_B64 exec, count, 0
+  // S_CMP_EQ_U32 count, 64
+  // S_CMOV_B64 exec, -1
+  Register InputReg = MI.getOperand(0).getReg();
+  MachineInstr *FirstMI = &*MBB->begin();
+  if (InputReg.isVirtual()) {
+    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+    assert(DefInstr && DefInstr->isCopy());
+    if (DefInstr->getParent() == MBB) {
+      if (DefInstr != FirstMI) {
+        // If the `InputReg` is defined in current block, we also need to
+        // move that instruction to the beginning of the block.
+        DefInstr->removeFromParent();
+        MBB->insert(FirstMI, DefInstr);
+        if (LIS)
+          LIS->handleMove(*DefInstr);
+      } else {
+        // If first instruction is definition then move pointer after it.
+        FirstMI = &*std::next(FirstMI->getIterator());
+      }
+    }
+  }
+
+  // Insert instruction sequence at block beginning (before vector operations).
+  const DebugLoc DL = MI.getDebugLoc();
+  const unsigned WavefrontSize = ST->getWavefrontSize();
+  const unsigned Mask = (WavefrontSize << 1) - 1;
+  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+                   .addReg(InputReg)
+                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+  auto BfmMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+          .addReg(CountReg)
+          .addImm(0);
+  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+                   .addReg(CountReg, RegState::Kill)
+                   .addImm(WavefrontSize);
+  auto CmovMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+              Exec)
+          .addImm(-1);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*BfeMI);
+  LIS->InsertMachineInstrInMaps(*BfmMI);
+  LIS->InsertMachineInstrInMaps(*CmpMI);
+  LIS->InsertMachineInstrInMaps(*CmovMI);
+
+  LIS->removeInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+  for (MachineInstr *MI : InitExecInstrs)
+    lowerInitExec(*MI);
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToCopyInstrs.clear();
   LowerToMovInstrs.clear();
   KillInstrs.clear();
+  InitExecInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   // Shader is simple does not need any state changes or any complex lowering
   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
       LowerToMovInstrs.empty() && KillInstrs.empty()) {
+    lowerInitExecInstrs();
     lowerLiveMaskQueries();
-    return !LiveMaskQueries.empty();
+    return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
   }
 
+  lowerInitExecInstrs();
+
   MachineBasicBlock &Entry = MF.front();
   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 239e0ee70572..04c6e940e6ed 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -235,8 +235,9 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
 }
 
 void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
-                               StringRef FnAttr) {
-  KernelRoot->removeFnAttr(FnAttr);
+                               ArrayRef<StringRef> FnAttrs) {
+  for (StringRef Attr : FnAttrs)
+    KernelRoot->removeFnAttr(Attr);
 
   SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
   SmallPtrSet<Function *, 8> Visited;
@@ -261,12 +262,15 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
             Function *PotentialCallee =
                 ExternalCallRecord.second->getFunction();
             assert(PotentialCallee);
-            if (!isKernelLDS(PotentialCallee))
-              PotentialCallee->removeFnAttr(FnAttr);
+            if (!isKernelLDS(PotentialCallee)) {
+              for (StringRef Attr : FnAttrs)
+                PotentialCallee->removeFnAttr(Attr);
+            }
           }
         }
       } else {
-        Callee->removeFnAttr(FnAttr);
+        for (StringRef Attr : FnAttrs)
+          Callee->removeFnAttr(Attr);
         if (Visited.insert(Callee).second)
           WorkList.push_back(Callee);
       }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 4d3ad328e131..e1cd4d03052b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 
@@ -54,7 +55,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
 /// Strip FnAttr attribute from any functions where we may have
 /// introduced its use.
 void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
-                               StringRef FnAttr);
+                               ArrayRef<StringRef> FnAttrs);
 
 /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
 /// if this is actually a memory update or an artificial clobber to facilitate
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a46c383115e2..919828753f45 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) {
   return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
 }
 
+static bool isHorizontalReduction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  uint64_t Flags = MCID.TSFlags;
+  return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
 namespace {
 
   using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -275,6 +281,16 @@ namespace {
       if (VPT->getOpcode() == ARM::MVE_VPST)
         return false;
 
+      // If the VPT block does not define something that is an "output", then
+      // the tail-predicated version will just perform a subset of the original
+      // vpt block, where the last lanes should not be used.
+      if (isVPTOpcode(VPT->getOpcode()) &&
+          all_of(Block.getInsts(), [](const MachineInstr *MI) {
+            return !MI->mayStore() && !MI->mayLoad() &&
+                   !isHorizontalReduction(*MI) && !isVCTP(MI);
+          }))
+        return true;
+
       auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
         MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
         return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
@@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) {
   return (Flags & ARMII::DoubleWidthResult) != 0;
 }
 
-static bool isHorizontalReduction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  uint64_t Flags = MCID.TSFlags;
-  return (Flags & ARMII::HorizontalReduction) != 0;
-}
-
 // Can this instruction generate a non-zero result when given only zeroed
 // operands? This allows us to know that, given operands with false bytes
 // zeroed by masked loads, that the result will also contain zeros in those
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 51384f25d245..9d7e4636abac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -171,6 +171,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   // Set operations for 'F' feature.
 
   if (Subtarget.hasBasicF()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+    setTruncStoreAction(MVT::f32, MVT::f16, Expand);
     setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
 
     setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
@@ -186,6 +188,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
     setOperationAction(ISD::FPOW, MVT::f32, Expand);
     setOperationAction(ISD::FREM, MVT::f32, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 
     if (Subtarget.is64Bit())
       setOperationAction(ISD::FRINT, MVT::f32, Legal);
@@ -202,7 +206,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   // Set operations for 'D' feature.
 
   if (Subtarget.hasBasicD()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
 
@@ -219,6 +225,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
     setOperationAction(ISD::FPOW, MVT::f64, Expand);
     setOperationAction(ISD::FREM, MVT::f64, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 
     if (Subtarget.is64Bit())
       setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -5004,6 +5012,10 @@ bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT,
   return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
 }
 
+bool LoongArchTargetLowering::signExtendConstant(const ConstantInt *CI) const {
+  return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
+}
+
 bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const {
   // TODO: Support vectors.
   if (Y.getValueType().isVector())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f274b1971fd2..9328831a17a3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -229,6 +229,7 @@ public:
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
   bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
+  bool signExtendConstant(const ConstantInt *CI) const override;
 
   bool hasAndNotCompare(SDValue Y) const override;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 83466d53f84d..c29c1b593321 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -46,7 +46,7 @@ static cl::opt<bool>
 
 static std::string computeDataLayout(const Triple &TT) {
   if (TT.isArch64Bit())
-    return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
+    return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128";
   assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported");
   return "e-m:e-p:32:32-i64:64-n32-S128";
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 5eefab59a6ab..b0cb24c63c3c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -40,7 +40,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMLegacyPass();
 ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
-FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMIntrRangePass();
 FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
@@ -53,12 +53,7 @@ MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
-  NVVMIntrRangePass();
-  NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-
-private:
-  unsigned SmVersion;
 };
 
 struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index f63697916d90..82770f866085 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -542,30 +542,24 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
   // If the NVVM IR has some of reqntid* specified, then output
   // the reqntid directive, and set the unspecified ones to 1.
   // If none of Reqntid* is specified, don't output reqntid directive.
-  unsigned Reqntidx, Reqntidy, Reqntidz;
-  Reqntidx = Reqntidy = Reqntidz = 1;
-  bool ReqSpecified = false;
-  ReqSpecified |= getReqNTIDx(F, Reqntidx);
-  ReqSpecified |= getReqNTIDy(F, Reqntidy);
-  ReqSpecified |= getReqNTIDz(F, Reqntidz);
+  std::optional<unsigned> Reqntidx = getReqNTIDx(F);
+  std::optional<unsigned> Reqntidy = getReqNTIDy(F);
+  std::optional<unsigned> Reqntidz = getReqNTIDz(F);
 
-  if (ReqSpecified)
-    O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz
-      << "\n";
+  if (Reqntidx || Reqntidy || Reqntidz)
+    O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
+      << ", " << Reqntidz.value_or(1) << "\n";
 
   // If the NVVM IR has some of maxntid* specified, then output
   // the maxntid directive, and set the unspecified ones to 1.
   // If none of maxntid* is specified, don't output maxntid directive.
-  unsigned Maxntidx, Maxntidy, Maxntidz;
-  Maxntidx = Maxntidy = Maxntidz = 1;
-  bool MaxSpecified = false;
-  MaxSpecified |= getMaxNTIDx(F, Maxntidx);
-  MaxSpecified |= getMaxNTIDy(F, Maxntidy);
-  MaxSpecified |= getMaxNTIDz(F, Maxntidz);
-
-  if (MaxSpecified)
-    O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz
-      << "\n";
+  std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
+  std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
+  std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
+
+  if (Maxntidx || Maxntidy || Maxntidz)
+    O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
+      << ", " << Maxntidz.value_or(1) << "\n";
 
   unsigned Mincta = 0;
   if (getMinCTASm(F, Mincta))
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 4dc3cea4bd8e..b60a1d747af7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -233,9 +233,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
         FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
-        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
-        // investigate and re-enable.
-        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
+        // Note: NVVMIntrRangePass was causing numerical discrepancies at one
+        // point, if issues crop up, consider disabling.
+        FPM.addPass(NVVMIntrRangePass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       });
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 013afe916e86..3a536db1c972 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -128,6 +128,14 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
   return true;
 }
 
+static std::optional<unsigned>
+findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) {
+  unsigned RetVal;
+  if (findOneNVVMAnnotation(&GV, PropName, RetVal))
+    return RetVal;
+  return std::nullopt;
+}
+
 bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            std::vector<unsigned> &retval) {
   auto &AC = getAnnotationCache();
@@ -252,32 +260,57 @@ std::string getSamplerName(const Value &val) {
   return std::string(val.getName());
 }
 
-bool getMaxNTIDx(const Function &F, unsigned &x) {
-  return findOneNVVMAnnotation(&F, "maxntidx", x);
+std::optional<unsigned> getMaxNTIDx(const Function &F) {
+  return findOneNVVMAnnotation(F, "maxntidx");
 }
 
-bool getMaxNTIDy(const Function &F, unsigned &y) {
-  return findOneNVVMAnnotation(&F, "maxntidy", y);
+std::optional<unsigned> getMaxNTIDy(const Function &F) {
+  return findOneNVVMAnnotation(F, "maxntidy");
 }
 
-bool getMaxNTIDz(const Function &F, unsigned &z) {
-  return findOneNVVMAnnotation(&F, "maxntidz", z);
+std::optional<unsigned> getMaxNTIDz(const Function &F) {
+  return findOneNVVMAnnotation(F, "maxntidz");
+}
+
+std::optional<unsigned> getMaxNTID(const Function &F) {
+  // Note: The semantics here are a bit strange. The PTX ISA states the
+  // following (11.4.2. Performance-Tuning Directives: .maxntid):
+  //
+  //  Note that this directive guarantees that the total number of threads does
+  //  not exceed the maximum, but does not guarantee that the limit in any
+  //  particular dimension is not exceeded.
+  std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F);
+  std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F);
+  std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F);
+  if (MaxNTIDx || MaxNTIDy || MaxNTIDz)
+    return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1);
+  return std::nullopt;
 }
 
 bool getMaxClusterRank(const Function &F, unsigned &x) {
   return findOneNVVMAnnotation(&F, "maxclusterrank", x);
 }
 
-bool getReqNTIDx(const Function &F, unsigned &x) {
-  return findOneNVVMAnnotation(&F, "reqntidx", x);
+std::optional<unsigned> getReqNTIDx(const Function &F) {
+  return findOneNVVMAnnotation(F, "reqntidx");
+}
+
+std::optional<unsigned> getReqNTIDy(const Function &F) {
+  return findOneNVVMAnnotation(F, "reqntidy");
 }
 
-bool getReqNTIDy(const Function &F, unsigned &y) {
-  return findOneNVVMAnnotation(&F, "reqntidy", y);
+std::optional<unsigned> getReqNTIDz(const Function &F) {
+  return findOneNVVMAnnotation(F, "reqntidz");
 }
 
-bool getReqNTIDz(const Function &F, unsigned &z) {
-  return findOneNVVMAnnotation(&F, "reqntidz", z);
+std::optional<unsigned> getReqNTID(const Function &F) {
+  // Note: The semantics here are a bit strange. See getMaxNTID.
+  std::optional<unsigned> ReqNTIDx = getReqNTIDx(F);
+  std::optional<unsigned> ReqNTIDy = getReqNTIDy(F);
+  std::optional<unsigned> ReqNTIDz = getReqNTIDz(F);
+  if (ReqNTIDx || ReqNTIDy || ReqNTIDz)
+    return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1);
+  return std::nullopt;
 }
 
 bool getMinCTASm(const Function &F, unsigned &x) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 2872db9fa213..e020bc0f02e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -48,13 +48,15 @@ std::string getTextureName(const Value &);
 std::string getSurfaceName(const Value &);
 std::string getSamplerName(const Value &);
 
-bool getMaxNTIDx(const Function &, unsigned &);
-bool getMaxNTIDy(const Function &, unsigned &);
-bool getMaxNTIDz(const Function &, unsigned &);
-
-bool getReqNTIDx(const Function &, unsigned &);
-bool getReqNTIDy(const Function &, unsigned &);
-bool getReqNTIDz(const Function &, unsigned &);
+std::optional<unsigned> getMaxNTIDx(const Function &);
+std::optional<unsigned> getMaxNTIDy(const Function &);
+std::optional<unsigned> getMaxNTIDz(const Function &);
+std::optional<unsigned> getMaxNTID(const Function &F);
+
+std::optional<unsigned> getReqNTIDx(const Function &);
+std::optional<unsigned> getReqNTIDy(const Function &);
+std::optional<unsigned> getReqNTIDz(const Function &);
+std::optional<unsigned> getReqNTID(const Function &);
 
 bool getMaxClusterRank(const Function &, unsigned &);
 bool getMinCTASm(const Function &, unsigned &);
diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 5381646434eb..f9d21b38a7ec 100644
--- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -1,4 +1,4 @@
-//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//===- NVVMIntrRange.cpp - Set range attributes for NVVM intrinsics -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,19 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass adds appropriate !range metadata for calls to NVVM
+// This pass adds appropriate range attributes for calls to NVVM
 // intrinsics that return a limited range of values.
 //
 //===----------------------------------------------------------------------===//
 
 #include "NVPTX.h"
-#include "llvm/IR/Constants.h"
+#include "NVPTXUtilities.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include <cstdint>
 
 using namespace llvm;
 
@@ -26,31 +28,20 @@ using namespace llvm;
 
 namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
 
-// Add !range metadata based on limits of given SM variant.
-static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
-                                         cl::Hidden, cl::desc("SM variant"));
-
 namespace {
 class NVVMIntrRange : public FunctionPass {
- private:
-   unsigned SmVersion;
-
- public:
-   static char ID;
-   NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
-   NVVMIntrRange(unsigned int SmVersion)
-       : FunctionPass(ID), SmVersion(SmVersion) {
+public:
+  static char ID;
+  NVVMIntrRange() : FunctionPass(ID) {
 
-     initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
-   }
+    initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+  }
 
-   bool runOnFunction(Function &) override;
+  bool runOnFunction(Function &) override;
 };
-}
+} // namespace
 
-FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
-  return new NVVMIntrRange(SmVersion);
-}
+FunctionPass *llvm::createNVVMIntrRangePass() { return new NVVMIntrRange(); }
 
 char NVVMIntrRange::ID = 0;
 INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
@@ -58,112 +49,110 @@ INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
 
 // Adds the passed-in [Low,High) range information as metadata to the
 // passed-in call instruction.
-static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
-  // This call already has range metadata, nothing to do.
-  if (C->getMetadata(LLVMContext::MD_range))
+static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
+  if (II->getMetadata(LLVMContext::MD_range))
     return false;
 
-  LLVMContext &Context = C->getParent()->getContext();
-  IntegerType *Int32Ty = Type::getInt32Ty(Context);
-  Metadata *LowAndHigh[] = {
-      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
-      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
-  C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+  const uint64_t BitWidth = II->getType()->getIntegerBitWidth();
+  ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High));
+
+  if (auto CurrentRange = II->getRange())
+    Range = Range.intersectWith(CurrentRange.value());
+
+  II->addRangeRetAttr(Range);
   return true;
 }
 
-static bool runNVVMIntrRange(Function &F, unsigned SmVersion) {
+static bool runNVVMIntrRange(Function &F) {
   struct {
     unsigned x, y, z;
   } MaxBlockSize, MaxGridSize;
-  MaxBlockSize.x = 1024;
-  MaxBlockSize.y = 1024;
-  MaxBlockSize.z = 64;
 
-  MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+  const unsigned MetadataNTID = getReqNTID(F).value_or(
+      getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
+
+  MaxBlockSize.x = std::min(1024u, MetadataNTID);
+  MaxBlockSize.y = std::min(1024u, MetadataNTID);
+  MaxBlockSize.z = std::min(64u, MetadataNTID);
+
+  MaxGridSize.x = 0x7fffffff;
   MaxGridSize.y = 0xffff;
   MaxGridSize.z = 0xffff;
 
   // Go through the calls in this function.
   bool Changed = false;
   for (Instruction &I : instructions(F)) {
-    CallInst *Call = dyn_cast<CallInst>(&I);
-    if (!Call)
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+    if (!II)
       continue;
 
-    if (Function *Callee = Call->getCalledFunction()) {
-      switch (Callee->getIntrinsicID()) {
-      // Index within block
-      case Intrinsic::nvvm_read_ptx_sreg_tid_x:
-        Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_tid_y:
-        Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_tid_z:
-        Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
-        break;
-
-      // Block size
-      case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
-        Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
-        Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
-        Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
-        break;
-
-      // Index within grid
-      case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
-        Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
-        Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
-        Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
-        break;
-
-      // Grid size
-      case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
-        Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
-        Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
-        break;
-      case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
-        Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
-        break;
-
-      // warp size is constant 32.
-      case Intrinsic::nvvm_read_ptx_sreg_warpsize:
-        Changed |= addRangeMetadata(32, 32+1, Call);
-        break;
-
-      // Lane ID is [0..warpsize)
-      case Intrinsic::nvvm_read_ptx_sreg_laneid:
-        Changed |= addRangeMetadata(0, 32, Call);
-        break;
-
-      default:
-        break;
-      }
+    switch (II->getIntrinsicID()) {
+    // Index within block
+    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+      Changed |= addRangeAttr(0, MaxBlockSize.x, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+      Changed |= addRangeAttr(0, MaxBlockSize.y, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+      Changed |= addRangeAttr(0, MaxBlockSize.z, II);
+      break;
+
+    // Block size
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+      Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+      Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+      Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
+      break;
+
+    // Index within grid
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+      Changed |= addRangeAttr(0, MaxGridSize.x, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+      Changed |= addRangeAttr(0, MaxGridSize.y, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+      Changed |= addRangeAttr(0, MaxGridSize.z, II);
+      break;
+
+    // Grid size
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+      Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+      Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
+      break;
+    case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+      Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
+      break;
+
+    // warp size is constant 32.
+    case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+      Changed |= addRangeAttr(32, 32 + 1, II);
+      break;
+
+    // Lane ID is [0..warpsize)
+    case Intrinsic::nvvm_read_ptx_sreg_laneid:
+      Changed |= addRangeAttr(0, 32, II);
+      break;
+
+    default:
+      break;
     }
   }
 
   return Changed;
 }
 
-bool NVVMIntrRange::runOnFunction(Function &F) {
-  return runNVVMIntrRange(F, SmVersion);
-}
-
-NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {}
+bool NVVMIntrRange::runOnFunction(Function &F) { return runNVVMIntrRange(F); }
 
 PreservedAnalyses NVVMIntrRangePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
-  return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none()
-                                        : PreservedAnalyses::all();
+  return runNVVMIntrRange(F) ? PreservedAnalyses::none()
+                             : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f4e84ade3b5a..bc0ae7a32c05 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1079,13 +1079,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     assert(IsAIX && TM.getCodeModel() == CodeModel::Small &&
            "PseudoOp only valid for small code model AIX");
 
-    // Transform %rN = ADDItoc/8 @op1, %r2.
+    // Transform %rN = ADDItoc/8 %r2, @op1.
     LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
     // Change the opcode to load address.
     TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8));
 
-    const MachineOperand &MO = MI->getOperand(1);
+    const MachineOperand &MO = MI->getOperand(2);
     assert(MO.isGlobal() && "Invalid operand for ADDItoc[8].");
 
     // Map the operand to its corresponding MCSymbol.
@@ -1094,7 +1094,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MCExpr *Exp =
         MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_None, OutContext);
 
-    TmpInst.getOperand(1) = TmpInst.getOperand(2);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 735050641adf..a07954bd0d8b 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2080,13 +2080,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
                       cast<GlobalVariable>(GV)->hasAttribute("toc-data");
 
   // For small code model, generate a simple TOC load.
-  if (CModel == CodeModel::Small)
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
-            IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc),
-            DestReg)
-        .addGlobalAddress(GV)
-        .addReg(PPC::X2);
-  else {
+  if (CModel == CodeModel::Small) {
+    auto MIB = BuildMI(
+        *FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+        IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), DestReg);
+    if (IsAIXTocData)
+      MIB.addReg(PPC::X2).addGlobalAddress(GV);
+    else
+      MIB.addGlobalAddress(GV).addReg(PPC::X2);
+  } else {
     // If the address is an externally defined symbol, a symbol with common
     // or externally available linkage, a non-local function address, or a
     // jump table address (not yet needed), or if we are generating code
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 275b3337a276..1a69d1e89313 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6102,8 +6102,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
                                    EVT OperandTy) {
       SDValue GA = TocEntry->getOperand(0);
       SDValue TocBase = TocEntry->getOperand(1);
-      SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase);
-      transferMemOperands(TocEntry, MN);
+      SDNode *MN = nullptr;
+      if (OpCode == PPC::ADDItoc || OpCode == PPC::ADDItoc8)
+        // toc-data access doesn't involve in loading from got, no need to
+        // keep memory operands.
+        MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, TocBase, GA);
+      else {
+        MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase);
+        transferMemOperands(TocEntry, MN);
+      }
       ReplaceNode(TocEntry, MN);
     };
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 9af8ada78376..eda5eb975e70 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1485,11 +1485,9 @@ def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:
 }
 
 // Local Data Transform
-def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
                    "#ADDItoc8",
-                   [(set i64:$rD,
-                     (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
-
+                   []>, isPPC64;
 let mayLoad = 1 in
 def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
                    "#LDtocL", []>, isPPC64;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index df6b2bf1a7b7..09f829943528 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3345,10 +3345,8 @@ def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor
 def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
                        "#ADDIStocHA", []>;
 // TOC Data Transform on AIX
-def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
-                   "#ADDItoc",
-                   [(set i32:$rD,
-                     (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tocentry32:$disp),
+                   "#ADDItoc", []>;
 def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
                    "#ADDItocL", []>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a96768240a93..82358cdd45ed 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -932,11 +932,11 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
            "Can't handle X0, X0 vsetvli yet");
     if (AVLReg == RISCV::X0)
       NewInfo.setAVLVLMAX();
-    else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS))
-      NewInfo.setAVLRegDef(VNI, AVLReg);
-    else {
-      assert(MI.getOperand(1).isUndef());
+    else if (MI.getOperand(1).isUndef())
       NewInfo.setAVLIgnored();
+    else {
+      VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS);
+      NewInfo.setAVLRegDef(VNI, AVLReg);
     }
   }
   NewInfo.setVTYPE(MI.getOperand(2).getImm());
@@ -1008,11 +1008,11 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
       }
       else
         InstrInfo.setAVLImm(Imm);
-    } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) {
-      InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
-    } else {
-      assert(VLOp.isUndef());
+    } else if (VLOp.isUndef()) {
       InstrInfo.setAVLIgnored();
+    } else {
+      VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS);
+      InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
     }
   } else {
     assert(isScalarExtractInstr(MI));
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 6d926ce551e0..b0949f5fc1d7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -1033,6 +1033,22 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
   let HasVecPolicyOp = 1;
 }
 
+class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
+                                 DAGOperand OpClass,
+                                 string Constraint = "",
+                                 int TargetConstraintType = 1> :
+      Pseudo<(outs RetClass:$rd),
+             (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>,
+      RISCVVPseudo {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let Constraints = Constraint;
+  let TargetOverlapConstraintType = TargetConstraintType;
+  let HasVLOp = 1;
+  let HasSEWOp = 1;
+}
+
 class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
                                      DAGOperand OpClass,
                                      string Constraint = "",
@@ -1422,24 +1438,6 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
   let UsesVXRM = 0;
 }
 
-// Like VPseudoBinaryNoMask, but output can be V0.
-class VPseudoBinaryMOutNoMask<VReg RetClass,
-                              VReg Op1Class,
-                              DAGOperand Op2Class,
-                              string Constraint,
-                              int TargetConstraintType = 1> :
-      Pseudo<(outs RetClass:$rd),
-             (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>,
-      RISCVVPseudo {
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let Constraints = Constraint;
-  let TargetOverlapConstraintType = TargetConstraintType;
-  let HasVLOp = 1;
-  let HasSEWOp = 1;
-}
-
 // Like VPseudoBinaryMask, but output can be V0.
 class VPseudoBinaryMOutMask<VReg RetClass,
                             RegisterClass Op1Class,
@@ -2056,9 +2054,10 @@ multiclass VPseudoVSFS_M {
   foreach mti = AllMasks in {
     defvar mx = mti.LMul.MX;
     let VLMul = mti.LMul.value in {
-      def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>,
+      def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>,
                            SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
                                       forceMergeOpRead=true>;
+      let ForceTailAgnostic = true in
       def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>,
                                      SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
                                                 forceMergeOpRead=true>;
@@ -2172,8 +2171,8 @@ multiclass VPseudoBinaryM<VReg RetClass,
                           int TargetConstraintType = 1,
                           bit Commutable = 0> {
   let VLMul = MInfo.value, isCommutable = Commutable in {
-    def "_" # MInfo.MX : VPseudoBinaryMOutNoMask<RetClass, Op1Class, Op2Class,
-                                                 Constraint, TargetConstraintType>;
+    def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
+                                             Constraint, TargetConstraintType>;
     let ForceTailAgnostic = true in
     def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask<RetClass, Op1Class,
                                                          Op2Class, Constraint, TargetConstraintType>,
@@ -4078,9 +4077,8 @@ class VPatMaskUnaryNoMask<string intrinsic_name,
                 (mti.Mask VR:$rs2),
                 VLOpFrag)),
                 (!cast<Instruction>(inst#"_M_"#mti.BX)
-                (mti.Mask (IMPLICIT_DEF)),
                 (mti.Mask VR:$rs2),
-                GPR:$vl, mti.Log2SEW, TA_MA)>;
+                GPR:$vl, mti.Log2SEW)>;
 
 class VPatMaskUnaryMask<string intrinsic_name,
                         string inst,
@@ -4153,27 +4151,6 @@ class VPatBinaryNoMaskTU<string intrinsic_name,
                    (op2_type op2_kind:$rs2),
                    GPR:$vl, sew, TU_MU)>;
 
-class VPatBinaryNoMaskRoundingMode<string intrinsic_name,
-                                   string inst,
-                                   ValueType result_type,
-                                   ValueType op1_type,
-                                   ValueType op2_type,
-                                   int sew,
-                                   VReg op1_reg_class,
-                                   DAGOperand op2_kind> :
-  Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
-                   (result_type (undef)),
-                   (op1_type op1_reg_class:$rs1),
-                   (op2_type op2_kind:$rs2),
-                   (XLenVT timm:$round),
-                   VLOpFrag)),
-                   (!cast<Instruction>(inst)
-                   (result_type (IMPLICIT_DEF)),
-                   (op1_type op1_reg_class:$rs1),
-                   (op2_type op2_kind:$rs2),
-                   (XLenVT timm:$round),
-                   GPR:$vl, sew, TA_MA)>;
-
 class VPatBinaryNoMaskTURoundingMode<string intrinsic_name,
                                      string inst,
                                      ValueType result_type,
@@ -4827,8 +4804,6 @@ multiclass VPatBinaryRoundingMode<string intrinsic,
                                   VReg result_reg_class,
                                   VReg op1_reg_class,
                                   DAGOperand op2_kind> {
-  def : VPatBinaryNoMaskRoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
-                                       sew, op1_reg_class, op2_kind>;
   def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
                                        sew, result_reg_class, op1_reg_class, op2_kind>;
   def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
@@ -6962,12 +6937,12 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
 foreach vti = AllIntegerVectors in {
   // Emit shift by 1 as an add since it might be faster.
   let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector undef),
+    def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge),
                                           (vti.Vector vti.RegClass:$rs1),
                                           (XLenVT 1), VLOpFrag)),
               (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX)
-                 (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1,
-                 vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TA_MA)>;
+                 vti.RegClass:$merge, vti.RegClass:$rs1,
+                 vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>;
     def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
                                                (vti.Vector vti.RegClass:$rs1),
                                                (XLenVT 1),
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 956b851fce6c..49838e685a6d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1459,11 +1459,22 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call,
         Component == 3 ? NumActualRetComponents - 1 : Component;
     assert(ExtractedComposite < NumActualRetComponents &&
            "Invalid composite index!");
+    Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+    SPIRVType *NewType = nullptr;
+    if (QueryResultType->getOpcode() == SPIRV::OpTypeVector) {
+      Register NewTypeReg = QueryResultType->getOperand(1).getReg();
+      if (TypeReg != NewTypeReg &&
+          (NewType = GR->getSPIRVTypeForVReg(NewTypeReg)) != nullptr)
+        TypeReg = NewTypeReg;
+    }
     MIRBuilder.buildInstr(SPIRV::OpCompositeExtract)
         .addDef(Call->ReturnRegister)
-        .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+        .addUse(TypeReg)
         .addUse(QueryResult)
         .addImm(ExtractedComposite);
+    if (NewType != nullptr)
+      insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder,
+                        MIRBuilder.getMF().getRegInfo());
   } else {
     // More than 1 component is expected, fill a new vector.
     auto MIB = MIRBuilder.buildInstr(SPIRV::OpVectorShuffle)
@@ -2063,16 +2074,30 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call,
   auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR);
 
   switch (Opcode) {
-  case SPIRV::OpGroupAsyncCopy:
-    return MIRBuilder.buildInstr(Opcode)
-        .addDef(Call->ReturnRegister)
-        .addUse(GR->getSPIRVTypeID(Call->ReturnType))
-        .addUse(Scope)
-        .addUse(Call->Arguments[0])
-        .addUse(Call->Arguments[1])
-        .addUse(Call->Arguments[2])
-        .addUse(buildConstantIntReg(1, MIRBuilder, GR))
-        .addUse(Call->Arguments[3]);
+  case SPIRV::OpGroupAsyncCopy: {
+    SPIRVType *NewType =
+        Call->ReturnType->getOpcode() == SPIRV::OpTypeEvent
+            ? nullptr
+            : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder);
+    Register TypeReg = GR->getSPIRVTypeID(NewType ? NewType : Call->ReturnType);
+    unsigned NumArgs = Call->Arguments.size();
+    Register EventReg = Call->Arguments[NumArgs - 1];
+    bool Res = MIRBuilder.buildInstr(Opcode)
+                   .addDef(Call->ReturnRegister)
+                   .addUse(TypeReg)
+                   .addUse(Scope)
+                   .addUse(Call->Arguments[0])
+                   .addUse(Call->Arguments[1])
+                   .addUse(Call->Arguments[2])
+                   .addUse(Call->Arguments.size() > 4
+                               ? Call->Arguments[3]
+                               : buildConstantIntReg(1, MIRBuilder, GR))
+                   .addUse(EventReg);
+    if (NewType != nullptr)
+      insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder,
+                        MIRBuilder.getMF().getRegInfo());
+    return Res;
+  }
   case SPIRV::OpGroupWaitEvents:
     return MIRBuilder.buildInstr(Opcode)
         .addUse(Scope)
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 24c6c2688642..edc9e1a33d9f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -586,6 +586,7 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC
 
 // Async Copy and Prefetch builtin records:
 defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
+defm : DemangledNativeBuiltin<"async_work_group_strided_copy", OpenCL_std, AsyncCopy, 5, 5, OpGroupAsyncCopy>;
 defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>;
 defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
 defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 5ef0be1cab72..bbd25dc85f52 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -61,9 +61,6 @@ class SPIRVEmitIntrinsics
   DenseMap<Instruction *, Type *> AggrConstTypes;
   DenseSet<Instruction *> AggrStores;
 
-  // a registry of created Intrinsic::spv_assign_ptr_type instructions
-  DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
-
   // deduce element type of untyped pointers
   Type *deduceElementType(Value *I);
   Type *deduceElementTypeHelper(Value *I);
@@ -98,14 +95,16 @@ class SPIRVEmitIntrinsics
     return B.CreateIntrinsic(IntrID, {Types}, Args);
   }
 
+  void buildAssignType(IRBuilder<> &B, Type *ElemTy, Value *Arg);
   void buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg);
+  void updateAssignType(CallInst *AssignCI, Value *Arg, Value *OfType);
 
   void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
   void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
   void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
   void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B);
-  void insertAssignTypeInstrForTargetExtTypes(TargetExtType *AssignedType,
-                                              Value *V, IRBuilder<> &B);
+  void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V,
+                                    IRBuilder<> &B);
   void replacePointerOperandWithPtrCast(Instruction *I, Value *Pointer,
                                         Type *ExpectedElementType,
                                         unsigned OperandToReplace,
@@ -218,15 +217,39 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
                        false);
 }
 
+void SPIRVEmitIntrinsics::buildAssignType(IRBuilder<> &B, Type *Ty,
+                                          Value *Arg) {
+  Value *OfType = PoisonValue::get(Ty);
+  CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type,
+                                       {Arg->getType()}, OfType, Arg, {}, B);
+  GR->addAssignPtrTypeInstr(Arg, AssignCI);
+}
+
 void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy,
                                          Value *Arg) {
-  CallInst *AssignPtrTyCI =
-      buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()},
-                      Constant::getNullValue(ElemTy), Arg,
-                      {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
+  Value *OfType = PoisonValue::get(ElemTy);
+  CallInst *AssignPtrTyCI = buildIntrWithMD(
+      Intrinsic::spv_assign_ptr_type, {Arg->getType()}, OfType, Arg,
+      {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
   GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
   GR->addDeducedElementType(Arg, ElemTy);
-  AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
+  GR->addAssignPtrTypeInstr(Arg, AssignPtrTyCI);
+}
+
+void SPIRVEmitIntrinsics::updateAssignType(CallInst *AssignCI, Value *Arg,
+                                           Value *OfType) {
+  LLVMContext &Ctx = Arg->getContext();
+  AssignCI->setArgOperand(
+      1, MetadataAsValue::get(
+             Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OfType))));
+  if (cast<IntrinsicInst>(AssignCI)->getIntrinsicID() !=
+      Intrinsic::spv_assign_ptr_type)
+    return;
+
+  // update association with the pointee type
+  Type *ElemTy = OfType->getType();
+  GR->addDeducedElementType(AssignCI, ElemTy);
+  GR->addDeducedElementType(Arg, ElemTy);
 }
 
 // Set element pointer type to the given value of ValueTy and tries to
@@ -513,19 +536,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(Instruction *I) {
     if (!Ty) {
       GR->addDeducedElementType(Op, KnownElemTy);
       // check if there is existing Intrinsic::spv_assign_ptr_type instruction
-      auto It = AssignPtrTypeInstr.find(Op);
-      if (It == AssignPtrTypeInstr.end()) {
+      CallInst *AssignCI = GR->findAssignPtrTypeInstr(Op);
+      if (AssignCI == nullptr) {
         Instruction *User = dyn_cast<Instruction>(Op->use_begin()->get());
         setInsertPointSkippingPhis(B, User ? User->getNextNode() : I);
         CallInst *CI =
             buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {OpTy}, OpTyVal, Op,
                             {B.getInt32(getPointerAddressSpace(OpTy))}, B);
-        AssignPtrTypeInstr[Op] = CI;
+        GR->addAssignPtrTypeInstr(Op, CI);
       } else {
-        It->second->setArgOperand(
-            1,
-            MetadataAsValue::get(
-                Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OpTyVal))));
+        updateAssignType(AssignCI, Op, OpTyVal);
       }
     } else {
       if (auto *OpI = dyn_cast<Instruction>(Op)) {
@@ -559,7 +579,9 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
     if (isAssignTypeInstr(U)) {
       B.SetInsertPoint(U);
       SmallVector<Value *, 2> Args = {New, U->getOperand(1)};
-      B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
+      CallInst *AssignCI =
+          B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
+      GR->addAssignPtrTypeInstr(New, AssignCI);
       U->eraseFromParent();
     } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) ||
                isa<CallInst>(U)) {
@@ -751,33 +773,39 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
   return NewI;
 }
 
-void SPIRVEmitIntrinsics::insertAssignTypeInstrForTargetExtTypes(
+void SPIRVEmitIntrinsics::insertAssignPtrTypeTargetExt(
     TargetExtType *AssignedType, Value *V, IRBuilder<> &B) {
-  // Do not emit spv_assign_type if the V is of the AssignedType already.
-  if (V->getType() == AssignedType)
-    return;
+  Type *VTy = V->getType();
 
-  // Do not emit spv_assign_type if there is one already targetting V. If the
-  // found spv_assign_type assigns a type different than AssignedType, report an
-  // error. Builtin types cannot be redeclared or casted.
-  for (auto User : V->users()) {
-    auto *II = dyn_cast<IntrinsicInst>(User);
-    if (!II || II->getIntrinsicID() != Intrinsic::spv_assign_type)
-      continue;
+  // A couple of sanity checks.
+  assert(isPointerTy(VTy) && "Expect a pointer type!");
+  if (auto PType = dyn_cast<TypedPointerType>(VTy))
+    if (PType->getElementType() != AssignedType)
+      report_fatal_error("Unexpected pointer element type!");
 
-    MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
-    Type *BuiltinType =
-        dyn_cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
-    if (BuiltinType != AssignedType)
-      report_fatal_error("Type mismatch " + BuiltinType->getTargetExtName() +
-                             "/" + AssignedType->getTargetExtName() +
-                             " for value " + V->getName(),
-                         false);
+  CallInst *AssignCI = GR->findAssignPtrTypeInstr(V);
+  if (!AssignCI) {
+    buildAssignType(B, AssignedType, V);
     return;
   }
 
-  Constant *Const = UndefValue::get(AssignedType);
-  buildIntrWithMD(Intrinsic::spv_assign_type, {V->getType()}, Const, V, {}, B);
+  Type *CurrentType =
+      dyn_cast<ConstantAsMetadata>(
+          cast<MetadataAsValue>(AssignCI->getOperand(1))->getMetadata())
+          ->getType();
+  if (CurrentType == AssignedType)
+    return;
+
+  // Builtin types cannot be redeclared or casted.
+  if (CurrentType->isTargetExtTy())
+    report_fatal_error("Type mismatch " + CurrentType->getTargetExtName() +
+                           "/" + AssignedType->getTargetExtName() +
+                           " for value " + V->getName(),
+                       false);
+
+  // Our previous guess about the type seems to be wrong, let's update
+  // inferred type according to a new, more precise type information.
+  updateAssignType(AssignCI, V, PoisonValue::get(AssignedType));
 }
 
 void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
@@ -850,7 +878,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
         ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B);
     GR->addDeducedElementType(CI, ExpectedElementType);
     GR->addDeducedElementType(Pointer, ExpectedElementType);
-    AssignPtrTypeInstr[Pointer] = CI;
+    GR->addAssignPtrTypeInstr(Pointer, CI);
     return;
   }
 
@@ -929,8 +957,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
 
   for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) {
     Value *ArgOperand = CI->getArgOperand(OpIdx);
-    if (!isa<PointerType>(ArgOperand->getType()) &&
-        !isa<TypedPointerType>(ArgOperand->getType()))
+    if (!isPointerTy(ArgOperand->getType()))
       continue;
 
     // Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs()
@@ -952,8 +979,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
       continue;
 
     if (ExpectedType->isTargetExtTy())
-      insertAssignTypeInstrForTargetExtTypes(cast<TargetExtType>(ExpectedType),
-                                             ArgOperand, B);
+      insertAssignPtrTypeTargetExt(cast<TargetExtType>(ExpectedType),
+                                   ArgOperand, B);
     else
       replacePointerOperandWithPtrCast(CI, ArgOperand, ExpectedType, OpIdx, B);
   }
@@ -1145,7 +1172,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
   CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()},
                                  EltTyConst, I, {B.getInt32(AddressSpace)}, B);
   GR->addDeducedElementType(CI, ElemTy);
-  AssignPtrTypeInstr[I] = CI;
+  GR->addAssignPtrTypeInstr(I, CI);
 }
 
 void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
@@ -1164,20 +1191,32 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
         TypeToAssign = It->second;
       }
     }
-    Constant *Const = UndefValue::get(TypeToAssign);
-    buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I, {}, B);
+    buildAssignType(B, TypeToAssign, I);
   }
   for (const auto &Op : I->operands()) {
     if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) ||
         // Check GetElementPtrConstantExpr case.
         (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) {
       setInsertPointSkippingPhis(B, I);
-      if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
-        buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
-                        UndefValue::get(B.getInt32Ty()), {}, B);
-      else if (!isa<Instruction>(Op))
-        buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {},
-                        B);
+      Type *OpTy = Op->getType();
+      if (isa<UndefValue>(Op) && OpTy->isAggregateType()) {
+        CallInst *AssignCI =
+            buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
+                            UndefValue::get(B.getInt32Ty()), {}, B);
+        GR->addAssignPtrTypeInstr(Op, AssignCI);
+      } else if (!isa<Instruction>(Op)) {
+        Type *OpTy = Op->getType();
+        if (auto PType = dyn_cast<TypedPointerType>(OpTy)) {
+          buildAssignPtr(B, PType->getElementType(), Op);
+        } else if (isPointerTy(OpTy)) {
+          Type *ElemTy = GR->findDeducedElementType(Op);
+          buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op), Op);
+        } else {
+          CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type,
+                                               {OpTy}, Op, Op, {}, B);
+          GR->addAssignPtrTypeInstr(Op, AssignCI);
+        }
+      }
     }
   }
 }
@@ -1368,14 +1407,12 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
       continue;
 
     insertAssignPtrTypeIntrs(I, B);
+    deduceOperandElementType(I);
     insertAssignTypeIntrs(I, B);
     insertPtrCastOrAssignTypeInstr(I, B);
     insertSpirvDecorations(I, B);
   }
 
-  for (auto &I : instructions(Func))
-    deduceOperandElementType(&I);
-
   for (auto *I : Worklist) {
     TrackConstants = true;
     if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index ef0973d03d15..db01f68f48de 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -73,8 +73,11 @@ class SPIRVGlobalRegistry {
   // untyped pointers.
   DenseMap<Value *, Type *> DeducedElTys;
   // Maps composite values to deduced types where untyped pointers are replaced
-  // with typed ones
+  // with typed ones.
   DenseMap<Value *, Type *> DeducedNestedTys;
+  // Maps values to "assign type" calls, thus being a registry of created
+  // Intrinsic::spv_assign_ptr_type instructions.
+  DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
 
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
@@ -149,6 +152,17 @@ public:
     return It == FunResPointerTypes.end() ? nullptr : It->second;
   }
 
+  // A registry of "assign type" records:
+  // - Add a record.
+  void addAssignPtrTypeInstr(Value *Val, CallInst *AssignPtrTyCI) {
+    AssignPtrTypeInstr[Val] = AssignPtrTyCI;
+  }
+  // - Find a record.
+  CallInst *findAssignPtrTypeInstr(const Value *Val) {
+    auto It = AssignPtrTypeInstr.find(Val);
+    return It == AssignPtrTypeInstr.end() ? nullptr : It->second;
+  }
+
   // Deduced element types of untyped pointers and composites:
   // - Add a record to the map of deduced element types.
   void addDeducedElementType(Value *Val, Type *Ty) { DeducedElTys[Val] = Ty; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 3d536085b78a..a0a253c23b1e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -417,7 +417,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
         MachineInstr *Def = MRI.getVRegDef(Reg);
         assert(Def && "Expecting an instruction that defines the register");
         // G_GLOBAL_VALUE already has type info.
-        if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
+        if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE &&
+            Def->getOpcode() != SPIRV::ASSIGN_TYPE)
           insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB,
                             MF.getRegInfo());
         ToErase.push_back(&MI);
@@ -427,7 +428,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
         MachineInstr *Def = MRI.getVRegDef(Reg);
         assert(Def && "Expecting an instruction that defines the register");
         // G_GLOBAL_VALUE already has type info.
-        if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
+        if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE &&
+            Def->getOpcode() != SPIRV::ASSIGN_TYPE)
           insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo());
         ToErase.push_back(&MI);
       } else if (MIOp == TargetOpcode::G_CONSTANT ||
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 8e2063121e00..f5bc584ac4e1 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -178,14 +178,15 @@ static wasm::WasmLimits DefaultLimits() {
 }
 
 static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx,
-                                                    const StringRef &Name) {
+                                                    const StringRef &Name,
+                                                    bool is64) {
   MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
   if (Sym) {
     if (!Sym->isFunctionTable())
       Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
   } else {
     Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
-    Sym->setFunctionTable();
+    Sym->setFunctionTable(is64);
     // The default function table is synthesized by the linker.
     Sym->setUndefined();
   }
@@ -258,7 +259,7 @@ public:
     MCAsmParserExtension::Initialize(Parser);
 
     DefaultFunctionTable = GetOrCreateFunctionTableSymbol(
-        getContext(), "__indirect_function_table");
+        getContext(), "__indirect_function_table", is64);
     if (!STI->checkFeatures("+reference-types"))
       DefaultFunctionTable->setOmitFromLinkingSection();
   }
@@ -508,7 +509,7 @@ public:
       auto &Tok = Lexer.getTok();
       if (Tok.is(AsmToken::Identifier)) {
         auto *Sym =
-            GetOrCreateFunctionTableSymbol(getContext(), Tok.getString());
+            GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64);
         const auto *Val = MCSymbolRefExpr::create(Sym, getContext());
         *Op = std::make_unique<WebAssemblyOperand>(
             WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(),
@@ -836,6 +837,9 @@ public:
       // symbol
       auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
       WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
+      if (is64) {
+        Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64;
+      }
       wasm::WasmTableType Type = {*ElemType, Limits};
       WasmSym->setTableType(Type);
       TOut.emitTableType(WasmSym);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 5e7279808cce..c5a047ee47d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -108,8 +108,9 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol(
     if (!Sym->isFunctionTable())
       Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
   } else {
+    bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit();
     Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
-    Sym->setFunctionTable();
+    Sym->setFunctionTable(is64);
     // The default function table is synthesized by the linker.
     Sym->setUndefined();
   }
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0bf3294af92a..3933e82b718f 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::x86_tileloaddt164_internal: {
       if (!Subtarget->hasAMXTILE())
         break;
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
       unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
                          ? X86::PTILELOADDV
                          : X86::PTILELOADDT1V;
@@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       break;
     }
     case Intrinsic::x86_tilestored64_internal: {
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
       unsigned Opc = X86::PTILESTOREDV;
       // _tile_stored_internal(row, col, buf, STRIDE, c)
       SDValue Base = Node->getOperand(4);
@@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::x86_tilestored64: {
       if (!Subtarget->hasAMXTILE())
         break;
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
       unsigned Opc;
       switch (IntNo) {
       default: llvm_unreachable("Unexpected intrinsic!");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7d30de15f84d..3fbab3af32bb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -615,6 +615,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FSIN, VT, Action);
     setOperationAction(ISD::FCOS, VT, Action);
     setOperationAction(ISD::FSINCOS, VT, Action);
+    setOperationAction(ISD::FTAN, VT, Action);
     setOperationAction(ISD::FSQRT, VT, Action);
     setOperationAction(ISD::FPOW, VT, Action);
     setOperationAction(ISD::FLOG, VT, Action);
@@ -833,9 +834,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
 
     // Always expand sin/cos functions even though x87 has an instruction.
+    // clang-format off
     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+    setOperationAction(ISD::FTAN   , MVT::f80, Expand);
+    // clang-format on
 
     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
@@ -888,11 +892,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEG, MVT::f128, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
 
+    // clang-format off
     setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
     setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
     setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
+    setOperationAction(ISD::FTAN,         MVT::f128, LibCall);
+    setOperationAction(ISD::STRICT_FTAN,  MVT::f128, LibCall);
+    // clang-format on
     // No STRICT_FSINCOS
     setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
@@ -944,9 +952,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                    MVT::v4f32, MVT::v8f32,  MVT::v16f32,
                    MVT::v2f64, MVT::v4f64,  MVT::v8f64 }) {
+    // clang-format off
     setOperationAction(ISD::FSIN,      VT, Expand);
     setOperationAction(ISD::FSINCOS,   VT, Expand);
     setOperationAction(ISD::FCOS,      VT, Expand);
+    setOperationAction(ISD::FTAN,      VT, Expand);
     setOperationAction(ISD::FREM,      VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::FPOW,      VT, Expand);
@@ -956,6 +966,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FEXP,      VT, Expand);
     setOperationAction(ISD::FEXP2,     VT, Expand);
     setOperationAction(ISD::FEXP10,    VT, Expand);
+    // clang-format on
   }
 
   // First set operation action for all vector types to either promote
@@ -2473,7 +2484,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // function casting to f64 and calling `fmod`.
   if (Subtarget.is32Bit() &&
       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
-    for (ISD::NodeType Op :
+    // clang-format off
+   for (ISD::NodeType Op :
          {ISD::FCEIL,  ISD::STRICT_FCEIL,
           ISD::FCOS,   ISD::STRICT_FCOS,
           ISD::FEXP,   ISD::STRICT_FEXP,
@@ -2482,9 +2494,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
           ISD::FLOG,   ISD::STRICT_FLOG,
           ISD::FLOG10, ISD::STRICT_FLOG10,
           ISD::FPOW,   ISD::STRICT_FPOW,
-          ISD::FSIN,   ISD::STRICT_FSIN})
+          ISD::FSIN,   ISD::STRICT_FSIN,
+          ISD::FTAN,   ISD::STRICT_FTAN})
       if (isOperationExpand(Op, MVT::f32))
         setOperationAction(Op, MVT::f32, Promote);
+  // clang-format on
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
@@ -26776,7 +26790,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     case Intrinsic::swift_async_context_addr: {
       SDLoc dl(Op);
       auto &MF = DAG.getMachineFunction();
-      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
       if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
         MF.getFrameInfo().setFrameAddressIsTaken(true);
         X86FI->setHasSwiftAsyncContext(true);
@@ -36781,7 +36795,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case TargetOpcode::PREALLOCATED_SETUP: {
     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
-    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
     MFI->setHasPreallocatedCall(true);
     int64_t PreallocatedId = MI.getOperand(0).getImm();
     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
@@ -36798,7 +36812,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
     int64_t PreallocatedId = MI.getOperand(1).getImm();
     int64_t ArgIdx = MI.getOperand(2).getImm();
-    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
                       << ", arg offset " << ArgOffset << "\n");
@@ -36841,6 +36855,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     unsigned Imm = MI.getOperand(0).getImm();
     BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
     MI.eraseFromParent(); // The pseudo is gone now.
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+    return BB;
+  }
+  case X86::PTILEZEROV: {
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
     return BB;
   }
   case X86::PTILELOADD:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index c47bee070e04..99deacc811a1 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in {
                                             GR16:$src2, opaquemem:$src3,
                                             TILE:$src4), []>;
     let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1,
-        canFoldAsLoad = 1 in
+        canFoldAsLoad = 1, usesCustomInserter = 1 in
       def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
                                 [(set TILE:$dst, (int_x86_tilezero_internal
                                   GR16:$src1, GR16:$src2))]>;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index b69058787a4e..079ac983a8a0 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -92,6 +92,14 @@ static bool isAMXIntrinsic(Value *I) {
   return false;
 }
 
+static bool containsAMXCode(Function &F) {
+  for (BasicBlock &BB : F)
+    for (Instruction &I : BB)
+      if (I.getType()->isX86_AMXTy())
+        return true;
+  return false;
+}
+
 static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
                                            Type *Ty) {
   Function &F = *BB->getParent();
@@ -1230,6 +1238,14 @@ public:
   }
 
   bool runOnFunction(Function &F) override {
+    // Performance optimization: most code doesn't use AMX, so return early if
+    // there are no instructions that produce AMX values. This is sufficient, as
+    // AMX arguments and constants are not allowed -- so any producer of an AMX
+    // value must be an instruction.
+    // TODO: find a cheaper way for this, without looking at all instructions.
+    if (!containsAMXCode(F))
+      return false;
+
     bool C = false;
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     TargetLibraryInfo *TLI =
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index f27676a27e86..613722b398f4 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -19,6 +19,7 @@
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
 }
 
 bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+    return false;
+
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   const X86InstrInfo *TII = ST.getInstrInfo();
   const TargetRegisterInfo *TRI = ST.getRegisterInfo();
@@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    // There won't be a tile copy if neither tile register live in nor live out.
-    bool HasTileCopy = false;
-    for (const auto &LI : MBB.liveins()) {
-      if (TILERegs.test(LI.PhysReg)) {
-        HasTileCopy = true;
-        break;
-      }
-    }
     LiveRegUnits UsedRegs(*TRI);
     UsedRegs.addLiveOuts(MBB);
-    if (!HasTileCopy) {
-      for (auto RegT : TILERegs.set_bits()) {
-        if (UsedRegs.available(RegT)) {
-          HasTileCopy = true;
-          break;
-        }
-      }
-    }
-    if (!HasTileCopy)
-      continue;
     for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
       UsedRegs.stepBackward(MI);
       if (!MI.isCopy())
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index f6e853270e07..8aaa49945f9d 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -21,6 +21,8 @@
 
 namespace llvm {
 
+enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
+
 /// X86MachineFunctionInfo - This class is derived from MachineFunction and
 /// contains private X86 target-specific information for each MachineFunction.
 class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// used to address arguments in a function using a base pointer.
   int SEHFramePtrSaveIndex = 0;
 
+  /// The AMX programing model used in the function.
+  AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None;
+
   /// True if this function has a subset of CSRs that is handled explicitly via
   /// copies.
   bool IsSplitCSR = false;
@@ -219,6 +224,13 @@ public:
   int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
   void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
 
+  AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; }
+  void setAMXProgModel(AMXProgModelEnum Model) {
+    assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) &&
+           "mixed model is not supported");
+    AMXProgModel = Model;
+  }
+
   SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
     return ForwardedMustTailRegParms;
   }
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 2d296771b1c0..186d4d84c251 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -620,11 +620,11 @@ def : WriteRes<WriteNop, []>;
 // Horizontal add/sub  instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : ICXWriteResPair<WriteFHAdd,  [ICXPort5,ICXPort015], 6, [2,1], 3, 6>;
-defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort015], 6, [2,1], 3, 7>;
+defm : ICXWriteResPair<WriteFHAdd,  [ICXPort5,ICXPort01], 6, [2,1], 3, 6>;
+defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort01], 6, [2,1], 3, 7>;
 defm : ICXWriteResPair<WritePHAdd,  [ICXPort5,ICXPort05],  3, [2,1], 3, 5>;
-defm : ICXWriteResPair<WritePHAddX, [ICXPort5,ICXPort015], 3, [2,1], 3, 6>;
-defm : ICXWriteResPair<WritePHAddY, [ICXPort5,ICXPort015], 3, [2,1], 3, 7>;
+defm : ICXWriteResPair<WritePHAddX, [ICXPort15,ICXPort015], 3, [2,1], 3, 6>;
+defm : ICXWriteResPair<WritePHAddY, [ICXPort15,ICXPort015], 3, [2,1], 3, 7>;
 
 // Remaining instrs.
 
@@ -886,7 +886,7 @@ def ICXWriteResGroup37 : SchedWriteRes<[ICXPort0,ICXPort5]> {
 }
 def: InstRW<[ICXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;
 
-def ICXWriteResGroup38 : SchedWriteRes<[ICXPort5,ICXPort01]> {
+def ICXWriteResGroup38 : SchedWriteRes<[ICXPort15,ICXPort01]> {
   let Latency = 3;
   let NumMicroOps = 3;
   let ReleaseAtCycles = [2,1];
@@ -1739,13 +1739,13 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort01]> {
 def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm",
                                               "(V?)CVTPS2PDrm")>;
 
-def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+def ICXWriteResGroup143 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> {
   let Latency = 9;
   let NumMicroOps = 4;
   let ReleaseAtCycles = [2,1,1];
 }
-def: InstRW<[ICXWriteResGroup143], (instregex "(V?)PHADDSWrm",
-                                              "(V?)PHSUBSWrm")>;
+def: InstRW<[ICXWriteResGroup143], (instrs PHADDSWrm, VPHADDSWrm,
+                                           PHSUBSWrm, VPHSUBSWrm)>;
 
 def ICXWriteResGroup146 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> {
   let Latency = 9;
@@ -1842,7 +1842,7 @@ def: InstRW<[ICXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
                                               "VPEXPANDDZ128rm(b?)",
                                               "VPEXPANDQZ128rm(b?)")>;
 
-def ICXWriteResGroup154 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+def ICXWriteResGroup154 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> {
   let Latency = 10;
   let NumMicroOps = 4;
   let ReleaseAtCycles = [2,1,1];
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index a7dff0ecbcd9..4fded44085e8 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -615,8 +615,8 @@ def : WriteRes<WriteNop, []>;
 // Horizontal add/sub  instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : SKXWriteResPair<WriteFHAdd,  [SKXPort5,SKXPort015], 6, [2,1], 3, 6>;
-defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>;
+defm : SKXWriteResPair<WriteFHAdd,  [SKXPort5,SKXPort01], 6, [2,1], 3, 6>;
+defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort01], 6, [2,1], 3, 7>;
 defm : SKXWriteResPair<WritePHAdd,  [SKXPort5,SKXPort05],  3, [2,1], 3, 5>;
 defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>;
 defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 68155acd9e5b..b3b8486c604b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -302,6 +302,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0x805", "cortex-a76") // Kryo 4xx/5xx Silver
         .Case("0xc00", "falkor")
         .Case("0xc01", "saphira")
+        .Case("0x001", "oryon-1")
         .Default("generic");
   if (Implementer == "0x53") { // Samsung Electronics Co., Ltd.
     // The Exynos chips have a convoluted ID scheme that doesn't seem to follow
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7464237d26d4..60a784ef002f 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -124,6 +124,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
     {{"gfx1103"},   {"gfx1103"}, GK_GFX1103, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1150"},   {"gfx1150"}, GK_GFX1150, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1151"},   {"gfx1151"}, GK_GFX1151, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
+    {{"gfx1152"},   {"gfx1152"}, GK_GFX1152, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1200"},   {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1201"},   {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
 
@@ -275,6 +276,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX1103: return {11, 0, 3};
   case GK_GFX1150: return {11, 5, 0};
   case GK_GFX1151: return {11, 5, 1};
+  case GK_GFX1152: return {11, 5, 2};
   case GK_GFX1200: return {12, 0, 0};
   case GK_GFX1201: return {12, 0, 1};
 
@@ -341,6 +343,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       Features["image-insts"] = true;
       Features["fp8-conversion-insts"] = true;
       break;
+    case GK_GFX1152:
     case GK_GFX1151:
     case GK_GFX1150:
     case GK_GFX1103:
@@ -542,6 +545,7 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) {
     switch (parseArchAMDGCN(GPU)) {
     case GK_GFX1201:
     case GK_GFX1200:
+    case GK_GFX1152:
     case GK_GFX1151:
     case GK_GFX1150:
     case GK_GFX1103:
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 9a5732dca5b7..549d03645f93 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -419,7 +419,8 @@ struct AAReturnedFromReturnedValues : public BaseType {
   /// See AbstractAttribute::updateImpl(...).
   ChangeStatus updateImpl(Attributor &A) override {
     StateType S(StateType::getBestState(this->getState()));
-    clampReturnedValueStates<AAType, StateType, IRAttributeKind, RecurseForSelectAndPHI>(
+    clampReturnedValueStates<AAType, StateType, IRAttributeKind,
+                             RecurseForSelectAndPHI>(
         A, *this, S,
         PropagateCallBaseContext ? this->getCallBaseContext() : nullptr);
     // TODO: If we know we visited all returned values, thus no are assumed
@@ -6973,10 +6974,9 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
     if (AI.LibraryFunctionId != LibFunc___kmpc_alloc_shared) {
       Instruction *CtxI = isa<InvokeInst>(AI.CB) ? AI.CB : AI.CB->getNextNode();
       if (!Explorer || !Explorer->findInContextOf(UniqueFree, CtxI)) {
-        LLVM_DEBUG(
-            dbgs()
-            << "[H2S] unique free call might not be executed with the allocation "
-            << *UniqueFree << "\n");
+        LLVM_DEBUG(dbgs() << "[H2S] unique free call might not be executed "
+                             "with the allocation "
+                          << *UniqueFree << "\n");
         return false;
       }
     }
@@ -10406,11 +10406,12 @@ struct AANoFPClassFloating : public AANoFPClassImpl {
 
 struct AANoFPClassReturned final
     : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl,
-                                   AANoFPClassImpl::StateType, false, Attribute::None, false> {
+                                   AANoFPClassImpl::StateType, false,
+                                   Attribute::None, false> {
   AANoFPClassReturned(const IRPosition &IRP, Attributor &A)
       : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl,
-                                     AANoFPClassImpl::StateType, false, Attribute::None, false>(
-            IRP, A) {}
+                                     AANoFPClassImpl::StateType, false,
+                                     Attribute::None, false>(IRP, A) {}
 
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override {
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 5fbdbc3a014f..92a9697720ef 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo
   DeadArgumentElimination.cpp
   ElimAvailExtern.cpp
   EmbedBitcodePass.cpp
+  ExpandVariadics.cpp
   ExtractGV.cpp
   ForceFunctionAttrs.cpp
   FunctionAttrs.cpp
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
new file mode 100644
index 000000000000..d340bc041ccd
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -0,0 +1,1012 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaced va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a varidic functions are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+    DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+    cl::init(ExpandVariadicsMode::Unspecified),
+    cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+                          "Use the implementation defaults"),
+               clEnumValN(ExpandVariadicsMode::Disable, "disable",
+                          "Disable the pass entirely"),
+               clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+                          "Optimise without changing ABI"),
+               clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+                          "Change variadic calling convention")));
+
+bool commandLineOverride() {
+  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependant behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+// This class may end up instantiated in TargetMachine instances, keeping it
+// here for now until enough targets are implemented for the API to evolve.
+class VariadicABIInfo {
+protected:
+  VariadicABIInfo() = default;
+
+public:
+  static std::unique_ptr<VariadicABIInfo> create(const Triple &T);
+
+  // Allow overriding whether the pass runs on a per-target basis
+  virtual bool enableForTarget() = 0;
+
+  // Whether a valist instance is passed by value or by address
+  // I.e. does it need to be alloca'ed and stored into, or can
+  // it be passed directly in a SSA register
+  virtual bool vaListPassedInSSARegister() = 0;
+
+  // The type of a va_list iterator object
+  virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+  // The type of a va_list as a function argument as lowered by C
+  virtual Type *vaListParameterType(Module &M) = 0;
+
+  // Initialize an allocated va_list object to point to an already
+  // initialized contiguous memory region.
+  // Return the value to pass as the va_list argument
+  virtual Value *initializeVaList(Module &M, LLVMContext &Ctx,
+                                  IRBuilder<> &Builder, AllocaInst *VaList,
+                                  Value *Buffer) = 0;
+
+  struct VAArgSlotInfo {
+    Align DataAlign; // With respect to the call frame
+    bool Indirect;   // Passed via a pointer
+  };
+  virtual VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) = 0;
+
+  // Targets implemented so far all have the same trivial lowering for these
+  bool vaEndIsNop() { return true; }
+  bool vaCopyIsMemcpy() { return true; }
+
+  virtual ~VariadicABIInfo() = default;
+};
+
+// Module implements getFunction() which returns nullptr on missing declaration
+// and getOrInsertFunction which creates one when absent. Intrinsics.h only
+// implements getDeclaration which creates one when missing. Checking whether
+// an intrinsic exists thus inserts it in the module and it then needs to be
+// deleted again to clean up.
+// The right name for the two functions on intrinsics would match Module::,
+// but doing that in a single change would introduce nullptr dereferences
+// where currently there are none. The minimal collateral damage approach
+// would split the change over a release to help downstream branches. As it
+// is unclear what approach will be preferred, implementing the trivial
+// function here in the meantime to decouple from that discussion.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id,
+                                    ArrayRef<Type *> Tys = {}) {
+  auto *FT = Intrinsic::getType(M->getContext(), Id, Tys);
+  return M->getFunction(Tys.empty() ? Intrinsic::getName(Id)
+                                    : Intrinsic::getName(Id, Tys, M, FT));
+}
+
+class ExpandVariadics : public ModulePass {
+
+  // The pass construction sets the default to optimize when called from middle
+  // end and lowering when called from the backend. The command line variable
+  // overrides that. This is useful for testing and debugging. It also allows
+  // building an applications with variadic functions wholly removed if one
+  // has sufficient control over the dependencies, e.g. a statically linked
+  // clang that has no variadic function calls remaining in the binary.
+
+public:
+  static char ID;
+  const ExpandVariadicsMode Mode;
+  std::unique_ptr<VariadicABIInfo> ABI;
+
+  ExpandVariadics(ExpandVariadicsMode Mode)
+      : ModulePass(ID),
+        Mode(commandLineOverride() ? ExpandVariadicsModeOption : Mode) {}
+
+  StringRef getPassName() const override { return "Expand variadic functions"; }
+
+  bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; }
+
+  bool runOnModule(Module &M) override;
+
+  bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+  Function *replaceAllUsesWithNewDeclaration(Module &M,
+                                             Function *OriginalFunction);
+
+  Function *deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
+                                        Function *OriginalFunction);
+
+  Function *defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
+                                  Function *VariadicWrapper,
+                                  Function *FixedArityReplacement);
+
+  bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+                  Function *NF);
+
+  // The intrinsic functions va_copy and va_end are removed unconditionally.
+  // They correspond to a memcpy and a no-op on all implemented targets.
+  // The va_start intrinsic is removed from basic blocks that were not created
+  // by this pass, some may remain if needed to maintain the external ABI.
+
+  template <Intrinsic::ID ID, typename InstructionType>
+  bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+                            PointerType *IntrinsicArgType) {
+    bool Changed = false;
+    const DataLayout &DL = M.getDataLayout();
+    if (Function *Intrinsic =
+            getPreexistingDeclaration(&M, ID, {IntrinsicArgType})) {
+      for (User *U : make_early_inc_range(Intrinsic->users()))
+        if (auto *I = dyn_cast<InstructionType>(U))
+          Changed |= expandVAIntrinsicCall(Builder, DL, I);
+
+      if (Intrinsic->use_empty())
+        Intrinsic->eraseFromParent();
+    }
+    return Changed;
+  }
+
+  bool expandVAIntrinsicUsersWithAddrspace(Module &M, IRBuilder<> &Builder,
+                                           unsigned Addrspace) {
+    auto &Ctx = M.getContext();
+    PointerType *IntrinsicArgType = PointerType::get(Ctx, Addrspace);
+    bool Changed = false;
+
+    // expand vastart before vacopy as vastart may introduce a vacopy
+    Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(
+        M, Builder, IntrinsicArgType);
+    Changed |= expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(
+        M, Builder, IntrinsicArgType);
+    Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(
+        M, Builder, IntrinsicArgType);
+    return Changed;
+  }
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VAStartInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                             VAEndInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VACopyInst *Inst);
+
+  FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+    // The type of "FTy" with the ... removed and a va_list appended
+    SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+    ArgTypes.push_back(ABI->vaListParameterType(M));
+    return FunctionType::get(FTy->getReturnType(), ArgTypes,
+                             /*IsVarArgs=*/false);
+  }
+
+  static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+                                   AllocaInst *Alloced) {
+    std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL);
+    uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0;
+    return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+  }
+
+  bool expansionApplicableToFunction(Module &M, Function *F) {
+    if (F->isIntrinsic() || !F->isVarArg() ||
+        F->hasFnAttribute(Attribute::Naked))
+      return false;
+
+    if (F->getCallingConv() != CallingConv::C)
+      return false;
+
+    if (rewriteABI())
+      return true;
+
+    if (!F->hasExactDefinition())
+      return false;
+
+    return true;
+  }
+
+  bool expansionApplicableToFunctionCall(CallBase *CB) {
+    if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+      if (CI->isMustTailCall()) {
+        // Cannot expand musttail calls
+        return false;
+      }
+
+      if (CI->getCallingConv() != CallingConv::C)
+        return false;
+
+      return true;
+    }
+
+    if (isa<InvokeInst>(CB)) {
+      // Invoke not implemented in initial implementation of pass
+      return false;
+    }
+
+    // Other unimplemented derivative of CallBase
+    return false;
+  }
+
+  class ExpandedCallFrame {
+    // Helper for constructing an alloca instance containing the arguments bound
+    // to the variadic ... parameter, rearranged to allow indexing through a
+    // va_list iterator
+    enum { N = 4 };
+    SmallVector<Type *, N> FieldTypes;
+    enum Tag { Store, Memcpy, Padding };
+    SmallVector<std::tuple<Value *, uint64_t, Tag>, N> Source;
+
+    template <Tag tag> void append(Type *FieldType, Value *V, uint64_t Bytes) {
+      FieldTypes.push_back(FieldType);
+      Source.push_back({V, Bytes, tag});
+    }
+
+  public:
+    void store(LLVMContext &Ctx, Type *T, Value *V) { append<Store>(T, V, 0); }
+
+    void memcpy(LLVMContext &Ctx, Type *T, Value *V, uint64_t Bytes) {
+      append<Memcpy>(T, V, Bytes);
+    }
+
+    void padding(LLVMContext &Ctx, uint64_t By) {
+      append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr, 0);
+    }
+
+    size_t size() const { return FieldTypes.size(); }
+    bool empty() const { return FieldTypes.empty(); }
+
+    StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
+      const bool IsPacked = true;
+      return StructType::create(Ctx, FieldTypes,
+                                (Twine(Name) + ".vararg").str(), IsPacked);
+    }
+
+    void initializeStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
+                                AllocaInst *Alloced) {
+
+      StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());
+
+      for (size_t I = 0; I < size(); I++) {
+
+        auto [V, bytes, tag] = Source[I];
+
+        if (tag == Padding) {
+          assert(V == nullptr);
+          continue;
+        }
+
+        auto Dst = Builder.CreateStructGEP(VarargsTy, Alloced, I);
+
+        assert(V != nullptr);
+
+        if (tag == Store)
+          Builder.CreateStore(V, Dst);
+
+        if (tag == Memcpy)
+          Builder.CreateMemCpy(Dst, {}, V, {}, bytes);
+      }
+    }
+  };
+};
+
+bool ExpandVariadics::runOnModule(Module &M) {
+  bool Changed = false;
+  if (Mode == ExpandVariadicsMode::Disable)
+    return Changed;
+
+  Triple TT(M.getTargetTriple());
+  ABI = VariadicABIInfo::create(TT);
+  if (!ABI)
+    return Changed;
+
+  if (!ABI->enableForTarget())
+    return Changed;
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+  IRBuilder<> Builder(Ctx);
+
+  // Lowering needs to run on all functions exactly once.
+  // Optimize could run on functions containing va_start exactly once.
+  for (Function &F : make_early_inc_range(M))
+    Changed |= runOnFunction(M, Builder, &F);
+
+  // After runOnFunction, all known calls to known variadic functions have been
+  // replaced. va_start intrinsics are presently (and invalidly!) only present
+  // in functions that used to be variadic and have now been replaced to take a
+  // va_list instead. If lowering as opposed to optimising, calls to unknown
+  // variadic functions have also been replaced.
+
+  {
+    // 0 and AllocaAddrSpace are sufficient for the targets implemented so far
+    unsigned Addrspace = 0;
+    Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);
+
+    Addrspace = DL.getAllocaAddrSpace();
+    if (Addrspace != 0)
+      Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace);
+  }
+
+  if (Mode != ExpandVariadicsMode::Lowering)
+    return Changed;
+
+  for (Function &F : make_early_inc_range(M)) {
+    if (F.isDeclaration())
+      continue;
+
+    // Now need to track down indirect calls. Can't find those
+    // by walking uses of variadic functions, need to crawl the instruction
+    // stream. Fortunately this is only necessary for the ABI rewrite case.
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : make_early_inc_range(BB)) {
+        if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+          if (CB->isIndirectCall()) {
+            FunctionType *FTy = CB->getFunctionType();
+            if (FTy->isVarArg())
+              Changed |= expandCall(M, Builder, CB, FTy, 0);
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
+                                    Function *OriginalFunction) {
+  bool Changed = false;
+
+  if (!expansionApplicableToFunction(M, OriginalFunction))
+    return Changed;
+
+  [[maybe_unused]] const bool OriginalFunctionIsDeclaration =
+      OriginalFunction->isDeclaration();
+  assert(rewriteABI() || !OriginalFunctionIsDeclaration);
+
+  // Declare a new function and redirect every use to that new function
+  Function *VariadicWrapper =
+      replaceAllUsesWithNewDeclaration(M, OriginalFunction);
+  assert(VariadicWrapper->isDeclaration());
+  assert(OriginalFunction->use_empty());
+
+  // Create a new function taking va_list containing the implementation of the
+  // original
+  Function *FixedArityReplacement =
+      deriveFixedArityReplacement(M, Builder, OriginalFunction);
+  assert(OriginalFunction->isDeclaration());
+  assert(FixedArityReplacement->isDeclaration() ==
+         OriginalFunctionIsDeclaration);
+  assert(VariadicWrapper->isDeclaration());
+
+  // Create a single block forwarding wrapper that turns a ... into a va_list
+  [[maybe_unused]] Function *VariadicWrapperDefine =
+      defineVariadicWrapper(M, Builder, VariadicWrapper, FixedArityReplacement);
+  assert(VariadicWrapperDefine == VariadicWrapper);
+  assert(!VariadicWrapper->isDeclaration());
+
+  // We now have:
+  // 1. the original function, now as a declaration with no uses
+  // 2. a variadic function that unconditionally calls a fixed arity replacement
+  // 3. a fixed arity function equivalent to the original function
+
+  // Replace known calls to the variadic with calls to the va_list equivalent
+  for (User *U : make_early_inc_range(VariadicWrapper->users())) {
+    if (CallBase *CB = dyn_cast<CallBase>(U)) {
+      Value *calledOperand = CB->getCalledOperand();
+      if (VariadicWrapper == calledOperand)
+        Changed |=
+            expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(),
+                       FixedArityReplacement);
+    }
+  }
+
+  // The original function will be erased.
+  // One of the two new functions will become a replacement for the original.
+  // When preserving the ABI, the other is an internal implementation detail.
+  // When rewriting the ABI, RAUW then the variadic one.
+  Function *const ExternallyAccessible =
+      rewriteABI() ? FixedArityReplacement : VariadicWrapper;
+  Function *const InternalOnly =
+      rewriteABI() ? VariadicWrapper : FixedArityReplacement;
+
+  // The external function is the replacement for the original
+  ExternallyAccessible->setLinkage(OriginalFunction->getLinkage());
+  ExternallyAccessible->setVisibility(OriginalFunction->getVisibility());
+  ExternallyAccessible->setComdat(OriginalFunction->getComdat());
+  ExternallyAccessible->takeName(OriginalFunction);
+
+  // Annotate the internal one as internal
+  InternalOnly->setVisibility(GlobalValue::DefaultVisibility);
+  InternalOnly->setLinkage(GlobalValue::InternalLinkage);
+
+  // The original is unused and obsolete
+  OriginalFunction->eraseFromParent();
+
+  InternalOnly->removeDeadConstantUsers();
+
+  if (rewriteABI()) {
+    // All known calls to the function have been removed by expandCall
+    // Resolve everything else by replaceAllUsesWith
+    VariadicWrapper->replaceAllUsesWith(FixedArityReplacement);
+    VariadicWrapper->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+Function *
+ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M,
+                                                  Function *OriginalFunction) {
+  auto &Ctx = M.getContext();
+  Function &F = *OriginalFunction;
+  FunctionType *FTy = F.getFunctionType();
+  Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace());
+
+  NF->setName(F.getName() + ".varargs");
+  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+
+  AttrBuilder ParamAttrs(Ctx);
+  AttributeList Attrs = NF->getAttributes();
+  Attrs = Attrs.addParamAttributes(Ctx, FTy->getNumParams(), ParamAttrs);
+  NF->setAttributes(Attrs);
+
+  OriginalFunction->replaceAllUsesWith(NF);
+  return NF;
+}
+
+Function *
+ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
+                                             Function *OriginalFunction) {
+  Function &F = *OriginalFunction;
+  // The purpose here is split the variadic function F into two functions
+  // One is a variadic function that bundles the passed argument into a va_list
+  // and passes it to the second function. The second function does whatever
+  // the original F does, except that it takes a va_list instead of the ...
+
+  assert(expansionApplicableToFunction(M, &F));
+
+  auto &Ctx = M.getContext();
+
+  // Returned value isDeclaration() is equal to F.isDeclaration()
+  // but that property is not invariant throughout this function
+  const bool FunctionIsDefinition = !F.isDeclaration();
+
+  FunctionType *FTy = F.getFunctionType();
+  SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+  ArgTypes.push_back(ABI->vaListParameterType(M));
+
+  FunctionType *NFTy = inlinableVariadicFunctionType(M, FTy);
+  Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
+
+  // Note - same attribute handling as DeadArgumentElimination
+  NF->copyAttributesFrom(&F);
+  NF->setComdat(F.getComdat());
+  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+  NF->setName(F.getName() + ".valist");
+  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+  AttrBuilder ParamAttrs(Ctx);
+
+  AttributeList Attrs = NF->getAttributes();
+  Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
+  NF->setAttributes(Attrs);
+
+  // Splice the implementation into the new function with minimal changes
+  if (FunctionIsDefinition) {
+    NF->splice(NF->begin(), &F);
+
+    auto NewArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(NewArg);
+      NewArg->setName(Arg.getName()); // takeName without killing the old one
+      ++NewArg;
+    }
+    NewArg->setName("varargs");
+  }
+
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F.getAllMetadata(MDs);
+  for (auto [KindID, Node] : MDs)
+    NF->addMetadata(KindID, *Node);
+  F.clearMetadata();
+
+  return NF;
+}
+
+Function *
+ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder,
+                                       Function *VariadicWrapper,
+                                       Function *FixedArityReplacement) {
+  auto &Ctx = Builder.getContext();
+  const DataLayout &DL = M.getDataLayout();
+  assert(VariadicWrapper->isDeclaration());
+  Function &F = *VariadicWrapper;
+
+  assert(F.isDeclaration());
+  Type *VaListTy = ABI->vaListType(Ctx);
+
+  auto *BB = BasicBlock::Create(Ctx, "entry", &F);
+  Builder.SetInsertPoint(BB);
+
+  AllocaInst *VaListInstance =
+      Builder.CreateAlloca(VaListTy, nullptr, "va_start");
+
+  Builder.CreateLifetimeStart(VaListInstance,
+                              sizeOfAlloca(Ctx, DL, VaListInstance));
+
+  Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
+                          {VaListInstance});
+
+  SmallVector<Value *> Args;
+  for (Argument &A : F.args())
+    Args.push_back(&A);
+
+  Type *ParameterType = ABI->vaListParameterType(M);
+  if (ABI->vaListPassedInSSARegister())
+    Args.push_back(Builder.CreateLoad(ParameterType, VaListInstance));
+  else
+    Args.push_back(Builder.CreateAddrSpaceCast(VaListInstance, ParameterType));
+
+  CallInst *Result = Builder.CreateCall(FixedArityReplacement, Args);
+
+  Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)},
+                          {VaListInstance});
+  Builder.CreateLifetimeEnd(VaListInstance,
+                            sizeOfAlloca(Ctx, DL, VaListInstance));
+
+  if (Result->getType()->isVoidTy())
+    Builder.CreateRetVoid();
+  else
+    Builder.CreateRet(Result);
+
+  return VariadicWrapper;
+}
+
+bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
+                                 FunctionType *VarargFunctionType,
+                                 Function *NF) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+
+  if (!expansionApplicableToFunctionCall(CB)) {
+    if (rewriteABI())
+      report_fatal_error("Cannot lower callbase instruction");
+    return Changed;
+  }
+
+  // This is tricky. The call instruction's function type might not match
+  // the type of the caller. When optimising, can leave it unchanged.
+  // Webassembly detects that inconsistency and repairs it.
+  FunctionType *FuncType = CB->getFunctionType();
+  if (FuncType != VarargFunctionType) {
+    if (!rewriteABI())
+      return Changed;
+    FuncType = VarargFunctionType;
+  }
+
+  auto &Ctx = CB->getContext();
+
+  Align MaxFieldAlign(1);
+
+  // The strategy is to allocate a call frame containing the variadic
+  // arguments laid out such that a target specific va_list can be initialized
+  // with it, such that target specific va_arg instructions will correctly
+  // iterate over it. This means getting the alignment right and sometimes
+  // embedding a pointer to the value instead of embedding the value itself.
+
+  Function *CBF = CB->getParent()->getParent();
+
+  ExpandedCallFrame Frame;
+
+  uint64_t CurrentOffset = 0;
+
+  for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
+    Value *ArgVal = CB->getArgOperand(I);
+    const bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
+    const bool IsByRef = CB->paramHasAttr(I, Attribute::ByRef);
+
+    // The type of the value being passed, decoded from byval/byref metadata if
+    // required
+    Type *const UnderlyingType = IsByVal   ? CB->getParamByValType(I)
+                                 : IsByRef ? CB->getParamByRefType(I)
+                                           : ArgVal->getType();
+    const uint64_t UnderlyingSize =
+        DL.getTypeAllocSize(UnderlyingType).getFixedValue();
+
+    // The type to be written into the call frame
+    Type *FrameFieldType = UnderlyingType;
+
+    // The value to copy from when initialising the frame alloca
+    Value *SourceValue = ArgVal;
+
+    VariadicABIInfo::VAArgSlotInfo SlotInfo = ABI->slotInfo(DL, UnderlyingType);
+
+    if (SlotInfo.Indirect) {
+      // The va_arg lowering loads through a pointer. Set up an alloca to aim
+      // that pointer at.
+      Builder.SetInsertPointPastAllocas(CBF);
+      Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+      Value *CallerCopy =
+          Builder.CreateAlloca(UnderlyingType, nullptr, "IndirectAlloca");
+
+      Builder.SetInsertPoint(CB);
+      if (IsByVal)
+        Builder.CreateMemCpy(CallerCopy, {}, ArgVal, {}, UnderlyingSize);
+      else
+        Builder.CreateStore(ArgVal, CallerCopy);
+
+      // Indirection now handled, pass the alloca ptr by value
+      FrameFieldType = DL.getAllocaPtrType(Ctx);
+      SourceValue = CallerCopy;
+    }
+
+    // Alignment of the value within the frame
+    // This probably needs to be controllable as a function of type
+    Align DataAlign = SlotInfo.DataAlign;
+
+    MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);
+
+    uint64_t DataAlignV = DataAlign.value();
+    if (uint64_t Rem = CurrentOffset % DataAlignV) {
+      // Inject explicit padding to deal with alignment requirements
+      uint64_t Padding = DataAlignV - Rem;
+      Frame.padding(Ctx, Padding);
+      CurrentOffset += Padding;
+    }
+
+    if (SlotInfo.Indirect) {
+      Frame.store(Ctx, FrameFieldType, SourceValue);
+    } else {
+      if (IsByVal)
+        Frame.memcpy(Ctx, FrameFieldType, SourceValue, UnderlyingSize);
+      else
+        Frame.store(Ctx, FrameFieldType, SourceValue);
+    }
+
+    CurrentOffset += DL.getTypeAllocSize(FrameFieldType).getFixedValue();
+  }
+
+  if (Frame.empty()) {
+    // Not passing any arguments, hopefully va_arg won't try to read any
+    // Creating a single byte frame containing nothing to point the va_list
+    // instance as that is less special-casey in the compiler and probably
+    // easier to interpret in a debugger.
+    Frame.padding(Ctx, 1);
+  }
+
+  StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());
+
+  // The struct instance needs to be at least MaxFieldAlign for the alignment of
+  // the fields to be correct at runtime. Use the native stack alignment instead
+  // if that's greater as that tends to give better codegen.
+  // This is an awkward way to guess whether there is a known stack alignment
+  // without hitting an assert in DL.getStackAlignment, 1024 is an arbitrary
+  // number likely to be greater than the natural stack alignment.
+  // TODO: DL.getStackAlignment could return a MaybeAlign instead of assert
+  Align AllocaAlign = MaxFieldAlign;
+  if (DL.exceedsNaturalStackAlignment(Align(1024)))
+    AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment());
+
+  // Put the alloca to hold the variadic args in the entry basic block.
+  Builder.SetInsertPointPastAllocas(CBF);
+
+  // SetCurrentDebugLocation when the builder SetInsertPoint method does not
+  Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+
+  // The awkward construction here is to set the alignment on the instance
+  AllocaInst *Alloced = Builder.Insert(
+      new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign),
+      "vararg_buffer");
+  Changed = true;
+  assert(Alloced->getAllocatedType() == VarargsTy);
+
+  // Initialize the fields in the struct
+  Builder.SetInsertPoint(CB);
+  Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+  Frame.initializeStructAlloca(DL, Builder, Alloced);
+
+  const unsigned NumArgs = FuncType->getNumParams();
+  SmallVector<Value *> Args(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+  // Initialize a va_list pointing to that struct and pass it as the last
+  // argument
+  AllocaInst *VaList = nullptr;
+  {
+    if (!ABI->vaListPassedInSSARegister()) {
+      Type *VaListTy = ABI->vaListType(Ctx);
+      Builder.SetInsertPointPastAllocas(CBF);
+      Builder.SetCurrentDebugLocation(CB->getStableDebugLoc());
+      VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument");
+      Builder.SetInsertPoint(CB);
+      Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+    }
+    Builder.SetInsertPoint(CB);
+    Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced));
+  }
+
+  // Attributes excluding any on the vararg arguments
+  AttributeList PAL = CB->getAttributes();
+  if (!PAL.isEmpty()) {
+    SmallVector<AttributeSet, 8> ArgAttrs;
+    for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++)
+      ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
+    PAL =
+        AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs);
+  }
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  CB->getOperandBundlesAsDefs(OpBundles);
+
+  CallBase *NewCB = nullptr;
+
+  if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+    Value *Dst = NF ? NF : CI->getCalledOperand();
+    FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType);
+
+    NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI);
+
+    CallInst::TailCallKind TCK = CI->getTailCallKind();
+    assert(TCK != CallInst::TCK_MustTail);
+
+    // Can't tail call a function that is being passed a pointer to an alloca
+    if (TCK == CallInst::TCK_Tail)
+      TCK = CallInst::TCK_None;
+    CI->setTailCallKind(TCK);
+
+  } else {
+    llvm_unreachable("Unreachable when !expansionApplicableToFunctionCall()");
+  }
+
+  if (VaList)
+    Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+
+  Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+  NewCB->setAttributes(PAL);
+  NewCB->takeName(CB);
+  NewCB->setCallingConv(CB->getCallingConv());
+  NewCB->setDebugLoc(DebugLoc());
+
+  // DeadArgElim and ArgPromotion copy exactly this metadata
+  NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+  CB->replaceAllUsesWith(NewCB);
+  CB->eraseFromParent();
+  return Changed;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VAStartInst *Inst) {
+  // Only removing va_start instructions that are not in variadic functions.
+  // Those would be rejected by the IR verifier before this pass.
+  // After splicing basic blocks from a variadic function into a fixed arity
+  // one the va_start that used to refer to the ... parameter still exist.
+  // There are also variadic functions that this pass did not change and
+  // va_start instances in the created single block wrapper functions.
+  // Replace exactly the instances in non-variadic functions as those are
+  // the ones to be fixed up to use the va_list passed as the final argument.
+
+  Function *ContainingFunction = Inst->getFunction();
+  if (ContainingFunction->isVarArg()) {
+    return false;
+  }
+
+  // The last argument is a vaListParameterType, either a va_list
+  // or a pointer to one depending on the target.
+  bool PassedByValue = ABI->vaListPassedInSSARegister();
+  Argument *PassedVaList =
+      ContainingFunction->getArg(ContainingFunction->arg_size() - 1);
+
+  // va_start takes a pointer to a va_list, e.g. one on the stack
+  Value *VaStartArg = Inst->getArgList();
+
+  Builder.SetInsertPoint(Inst);
+
+  if (PassedByValue) {
+    // The general thing to do is create an alloca, store the va_list argument
+    // to it, then create a va_copy. When vaCopyIsMemcpy(), this optimises to a
+    // store to the VaStartArg.
+    assert(ABI->vaCopyIsMemcpy());
+    Builder.CreateStore(PassedVaList, VaStartArg);
+  } else {
+
+    // Otherwise emit a vacopy to pick up target-specific handling if any
+    auto &Ctx = Builder.getContext();
+
+    Builder.CreateIntrinsic(Intrinsic::vacopy, {DL.getAllocaPtrType(Ctx)},
+                            {VaStartArg, PassedVaList});
+  }
+
+  Inst->eraseFromParent();
+  return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                                            VAEndInst *Inst) {
+  assert(ABI->vaEndIsNop());
+  Inst->eraseFromParent();
+  return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VACopyInst *Inst) {
+  assert(ABI->vaCopyIsMemcpy());
+  Builder.SetInsertPoint(Inst);
+
+  auto &Ctx = Builder.getContext();
+  Type *VaListTy = ABI->vaListType(Ctx);
+  uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+
+  Builder.CreateMemCpy(Inst->getDest(), {}, Inst->getSrc(), {},
+                       Builder.getInt32(Size));
+
+  Inst->eraseFromParent();
+  return true;
+}
+
+struct Amdgpu final : public VariadicABIInfo {
+
+  bool enableForTarget() override { return true; }
+
+  bool vaListPassedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *Buffer) override {
+    // Given Buffer, which is an AllocInst of vararg_buffer
+    // need to return something usable as parameter type
+    return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M));
+  }
+
+  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
+    return {Align(4), false};
+  }
+};
+
+struct Wasm final : public VariadicABIInfo {
+
+  bool enableForTarget() override {
+    // Currently wasm is only used for testing.
+    return commandLineOverride();
+  }
+
+  bool vaListPassedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *Buffer) override {
+    return Buffer;
+  }
+
+  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
+    LLVMContext &Ctx = Parameter->getContext();
+    const unsigned MinAlign = 4;
+    Align A = DL.getABITypeAlign(Parameter);
+    if (A < MinAlign)
+      A = Align(MinAlign);
+
+    if (auto s = dyn_cast<StructType>(Parameter)) {
+      if (s->getNumElements() > 1) {
+        return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true};
+      }
+    }
+
+    return {A, false};
+  }
+};
+
+std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) {
+  switch (T.getArch()) {
+  case Triple::r600:
+  case Triple::amdgcn: {
+    return std::make_unique<Amdgpu>();
+  }
+
+  case Triple::wasm32: {
+    return std::make_unique<Wasm>();
+  }
+
+  default:
+    return {};
+  }
+}
+
+} // namespace
+
+char ExpandVariadics::ID = 0;
+
+INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false,
+                false)
+
+ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode M) {
+  return new ExpandVariadics(M);
+}
+
+PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
+  return ExpandVariadics(Mode).runOnModule(M) ? PreservedAnalyses::none()
+                                              : PreservedAnalyses::all();
+}
+
+ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode M) : Mode(M) {}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 03923b83cf34..f033d2b0d6d0 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -262,8 +262,70 @@ public:
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // The set of IDs for contexts including this node.
-    DenseSet<uint32_t> ContextIds;
+    // Get the list of edges from which we can compute allocation information
+    // such as the context ids and allocation type of this node.
+    const std::vector<std::shared_ptr<ContextEdge>> *
+    getEdgesWithAllocInfo() const {
+      // If node has any callees, compute from those, otherwise compute from
+      // callers (i.e. if this is the leaf allocation node).
+      if (!CalleeEdges.empty())
+        return &CalleeEdges;
+      if (!CallerEdges.empty()) {
+        // A node with caller edges but no callee edges must be the allocation
+        // node.
+        assert(IsAllocation);
+        return &CallerEdges;
+      }
+      return nullptr;
+    }
+
+    // Compute the context ids for this node from the union of its edge context
+    // ids.
+    DenseSet<uint32_t> getContextIds() const {
+      DenseSet<uint32_t> ContextIds;
+      auto *Edges = getEdgesWithAllocInfo();
+      if (!Edges)
+        return {};
+      unsigned Count = 0;
+      for (auto &Edge : *Edges)
+        Count += Edge->getContextIds().size();
+      ContextIds.reserve(Count);
+      for (auto &Edge : *Edges)
+        ContextIds.insert(Edge->getContextIds().begin(),
+                          Edge->getContextIds().end());
+      return ContextIds;
+    }
+
+    // Compute the allocation type for this node from the OR of its edge
+    // allocation types.
+    uint8_t computeAllocType() const {
+      auto *Edges = getEdgesWithAllocInfo();
+      if (!Edges)
+        return (uint8_t)AllocationType::None;
+      uint8_t BothTypes =
+          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+      uint8_t AllocType = (uint8_t)AllocationType::None;
+      for (auto &Edge : *Edges) {
+        AllocType |= Edge->AllocTypes;
+        // Bail early if alloc type reached both, no further refinement.
+        if (AllocType == BothTypes)
+          return AllocType;
+      }
+      return AllocType;
+    }
+
+    // The context ids set for this node is empty if its edge context ids are
+    // also all empty.
+    bool emptyContextIds() const {
+      auto *Edges = getEdgesWithAllocInfo();
+      if (!Edges)
+        return true;
+      for (auto &Edge : *Edges) {
+        if (!Edge->getContextIds().empty())
+          return false;
+      }
+      return true;
+    }
 
     // List of clones of this ContextNode, initially empty.
     std::vector<ContextNode *> Clones;
@@ -308,15 +370,11 @@ public:
     void printCall(raw_ostream &OS) const { Call.print(OS); }
 
     // True if this node was effectively removed from the graph, in which case
-    // its context id set, caller edges, and callee edges should all be empty.
+    // it should have an allocation type of None and empty context ids.
     bool isRemoved() const {
-      // Note that we can have non-empty context ids with empty caller and
-      // callee edges if the graph ends up with a single node.
-      if (ContextIds.empty())
-        assert(CalleeEdges.empty() && CallerEdges.empty() &&
-               "Context ids empty but at least one of callee and caller edges "
-               "were not!");
-      return ContextIds.empty();
+      assert((AllocTypes == (uint8_t)AllocationType::None) ==
+             emptyContextIds());
+      return AllocTypes == (uint8_t)AllocationType::None;
     }
 
     void dump() const;
@@ -429,7 +487,8 @@ private:
   /// else to its callers. Also updates OrigNode's edges to remove any context
   /// ids moved to the newly created edge.
   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
-                      bool TowardsCallee);
+                      bool TowardsCallee,
+                      DenseSet<uint32_t> RemainingContextIds);
 
   /// Get the stack id corresponding to the given Id or Index (for IR this will
   /// return itself, for a summary index this will return the id recorded in the
@@ -958,7 +1017,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
 
   // Update alloc type and context ids for this MIB.
   AllocNode->AllocTypes |= (uint8_t)AllocType;
-  AllocNode->ContextIds.insert(LastContextId);
 
   // Now add or update nodes for each stack id in alloc's context.
   // Later when processing the stack ids on non-alloc callsites we will adjust
@@ -983,7 +1041,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     auto Ins = StackIdSet.insert(StackId);
     if (!Ins.second)
       StackNode->Recursive = true;
-    StackNode->ContextIds.insert(LastContextId);
     StackNode->AllocTypes |= (uint8_t)AllocType;
     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
     PrevNode = StackNode;
@@ -1034,7 +1091,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       // it resulted in any added ids to NextNode.
       if (!NewIdsToAdd.empty()) {
         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
-        NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
         UpdateCallers(NextNode, Visited, UpdateCallers);
       }
     }
@@ -1043,21 +1099,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   DenseSet<const ContextEdge *> Visited;
   for (auto &Entry : AllocationCallToContextNodeMap) {
     auto *Node = Entry.second;
-    // Update ids on the allocation nodes before calling the recursive
-    // update along caller edges, since this simplifies the logic during
-    // that traversal.
-    DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
-    Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
     UpdateCallers(Node, Visited, UpdateCallers);
   }
 }
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
-    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
-  // Make a copy of the context ids, since this will be adjusted below as they
-  // are moved.
-  DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
+    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
+    // This must be passed by value to make a copy since it will be adjusted
+    // as ids are moved.
+    DenseSet<uint32_t> RemainingContextIds) {
   auto &OrigEdges =
       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
   // Increment iterator in loop so that we can remove edges as needed.
@@ -1104,6 +1155,51 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
 }
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
+static void checkEdge(
+    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
+  // Confirm that alloc type is not None and that we have at least one context
+  // id.
+  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
+  assert(!Edge->ContextIds.empty());
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
+                      bool CheckEdges = true) {
+  if (Node->isRemoved())
+    return;
+#ifndef NDEBUG
+  // Compute node's context ids once for use in asserts.
+  auto NodeContextIds = Node->getContextIds();
+#endif
+  // Node's context ids should be the union of both its callee and caller edge
+  // context ids.
+  if (Node->CallerEdges.size()) {
+    DenseSet<uint32_t> CallerEdgeContextIds(
+        Node->CallerEdges.front()->ContextIds);
+    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
+      if (CheckEdges)
+        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
+      set_union(CallerEdgeContextIds, Edge->ContextIds);
+    }
+    // Node can have more context ids than callers if some contexts terminate at
+    // node and some are longer.
+    assert(NodeContextIds == CallerEdgeContextIds ||
+           set_is_subset(CallerEdgeContextIds, NodeContextIds));
+  }
+  if (Node->CalleeEdges.size()) {
+    DenseSet<uint32_t> CalleeEdgeContextIds(
+        Node->CalleeEdges.front()->ContextIds);
+    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
+      if (CheckEdges)
+        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
+      set_union(CalleeEdgeContextIds, Edge->getContextIds());
+    }
+    assert(NodeContextIds == CalleeEdgeContextIds);
+  }
+}
+
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     assignStackNodesPostOrder(ContextNode *Node,
                               DenseSet<const ContextNode *> &Visited,
@@ -1178,7 +1274,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // duplicated context ids. We have to recompute as we might have overlap
     // overlap between the saved context ids for different last nodes, and
     // removed them already during the post order traversal.
-    set_intersect(SavedContextIds, FirstNode->ContextIds);
+    set_intersect(SavedContextIds, FirstNode->getContextIds());
     ContextNode *PrevNode = nullptr;
     for (auto Id : Ids) {
       ContextNode *CurNode = getNodeForStackId(Id);
@@ -1211,18 +1307,17 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ContextNode *NewNode = NodeOwner.back().get();
     NodeToCallingFunc[NewNode] = Func;
     NonAllocationCallToContextNodeMap[Call] = NewNode;
-    NewNode->ContextIds = SavedContextIds;
-    NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
+    NewNode->AllocTypes = computeAllocType(SavedContextIds);
 
     // Connect to callees of innermost stack frame in inlined call chain.
     // This updates context ids for FirstNode's callee's to reflect those
     // moved to NewNode.
-    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
+    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
 
     // Connect to callers of outermost stack frame in inlined call chain.
     // This updates context ids for FirstNode's caller's to reflect those
     // moved to NewNode.
-    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
+    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
 
     // Now we need to remove context ids from edges/nodes between First and
     // Last Node.
@@ -1234,18 +1329,32 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
 
       // Remove the context ids moved to NewNode from CurNode, and the
       // edge from the prior node.
-      set_subtract(CurNode->ContextIds, NewNode->ContextIds);
       if (PrevNode) {
         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
         assert(PrevEdge);
-        set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
+        set_subtract(PrevEdge->getContextIds(), SavedContextIds);
         if (PrevEdge->getContextIds().empty()) {
           PrevNode->eraseCallerEdge(PrevEdge);
           CurNode->eraseCalleeEdge(PrevEdge);
         }
       }
+      // Since we update the edges from leaf to tail, only look at the callee
+      // edges. This isn't an alloc node, so if there are no callee edges, the
+      // alloc type is None.
+      CurNode->AllocTypes = CurNode->CalleeEdges.empty()
+                                ? (uint8_t)AllocationType::None
+                                : CurNode->computeAllocType();
       PrevNode = CurNode;
     }
+    if (VerifyNodes) {
+      checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
+      for (auto Id : Ids) {
+        ContextNode *CurNode = getNodeForStackId(Id);
+        // We should only have kept stack ids that had nodes.
+        assert(CurNode);
+        checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
+      }
+    }
   }
 }
 
@@ -1319,7 +1428,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
 
     // Initialize the context ids with the last node's. We will subsequently
     // refine the context ids by computing the intersection along all edges.
-    DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
+    DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
     assert(!LastNodeContextIds.empty());
 
     for (unsigned I = 0; I < Calls.size(); I++) {
@@ -1442,6 +1551,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   DenseSet<const ContextNode *> Visited;
   for (auto &Entry : AllocationCallToContextNodeMap)
     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
+  if (VerifyCCG)
+    check();
 }
 
 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
@@ -1786,8 +1897,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
     // First check if we have already synthesized a node for this tail call.
     if (TailCallToContextNodeMap.count(NewCall)) {
       NewNode = TailCallToContextNodeMap[NewCall];
-      NewNode->ContextIds.insert(Edge->ContextIds.begin(),
-                                 Edge->ContextIds.end());
       NewNode->AllocTypes |= Edge->AllocTypes;
     } else {
       FuncToCallsWithMetadata[Func].push_back({NewCall});
@@ -1797,7 +1906,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
       NewNode = NodeOwner.back().get();
       NodeToCallingFunc[NewNode] = Func;
       TailCallToContextNodeMap[NewCall] = NewNode;
-      NewNode->ContextIds = Edge->ContextIds;
       NewNode->AllocTypes = Edge->AllocTypes;
     }
 
@@ -2091,6 +2199,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
   OS << "\n";
   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
   OS << "\tContextIds:";
+  // Make a copy of the computed context ids that we can sort for stability.
+  auto ContextIds = getContextIds();
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
   std::sort(SortedIds.begin(), SortedIds.end());
   for (auto Id : SortedIds)
@@ -2151,53 +2261,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
 }
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
-static void checkEdge(
-    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
-  // Confirm that alloc type is not None and that we have at least one context
-  // id.
-  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
-  assert(!Edge->ContextIds.empty());
-}
-
-template <typename DerivedCCG, typename FuncTy, typename CallTy>
-static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
-                      bool CheckEdges = true) {
-  if (Node->isRemoved())
-    return;
-  // Node's context ids should be the union of both its callee and caller edge
-  // context ids.
-  if (Node->CallerEdges.size()) {
-    auto EI = Node->CallerEdges.begin();
-    auto &FirstEdge = *EI;
-    EI++;
-    DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
-    for (; EI != Node->CallerEdges.end(); EI++) {
-      const auto &Edge = *EI;
-      if (CheckEdges)
-        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
-      set_union(CallerEdgeContextIds, Edge->ContextIds);
-    }
-    // Node can have more context ids than callers if some contexts terminate at
-    // node and some are longer.
-    assert(Node->ContextIds == CallerEdgeContextIds ||
-           set_is_subset(CallerEdgeContextIds, Node->ContextIds));
-  }
-  if (Node->CalleeEdges.size()) {
-    auto EI = Node->CalleeEdges.begin();
-    auto &FirstEdge = *EI;
-    EI++;
-    DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
-    for (; EI != Node->CalleeEdges.end(); EI++) {
-      const auto &Edge = *EI;
-      if (CheckEdges)
-        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
-      set_union(CalleeEdgeContextIds, Edge->ContextIds);
-    }
-    assert(Node->ContextIds == CalleeEdgeContextIds);
-  }
-}
-
-template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
   for (const auto Node : nodes<GraphType>(this)) {
@@ -2284,7 +2347,7 @@ struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
 
   static std::string getNodeAttributes(NodeRef Node, GraphType) {
     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
-                                   getContextIds(Node->ContextIds) + "\"")
+                                   getContextIds(Node->getContextIds()) + "\"")
                                       .str();
     AttributeString +=
         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
@@ -2443,16 +2506,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     set_subtract(Edge->ContextIds, ContextIdsToMove);
     Edge->AllocTypes = computeAllocType(Edge->ContextIds);
   }
-  // Now perform some updates that are common to all cases: the NewCallee gets
-  // the moved ids added, and we need to remove those ids from OldCallee and
-  // update its alloc type (NewCallee alloc type updates handled above).
-  NewCallee->ContextIds.insert(ContextIdsToMove.begin(),
-                               ContextIdsToMove.end());
-  set_subtract(OldCallee->ContextIds, ContextIdsToMove);
-  OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
-  // OldCallee alloc type should be None iff its context id set is now empty.
-  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
-         OldCallee->ContextIds.empty());
   // Now walk the old callee node's callee edges and move Edge's context ids
   // over to the corresponding edge into the clone (which is created here if
   // this is a newly created clone).
@@ -2484,6 +2537,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     NewCallee->CalleeEdges.push_back(NewEdge);
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
+  // Recompute the node alloc type now that its callee edges have been
+  // updated (since we will compute from those edges).
+  OldCallee->AllocTypes = OldCallee->computeAllocType();
+  // OldCallee alloc type should be None iff its context id set is now empty.
+  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
+         OldCallee->emptyContextIds());
   if (VerifyCCG) {
     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
@@ -2528,7 +2587,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
   DenseSet<const ContextNode *> Visited;
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
-    identifyClones(Entry.second, Visited, Entry.second->ContextIds);
+    identifyClones(Entry.second, Visited, Entry.second->getContextIds());
   }
   Visited.clear();
   for (auto &Entry : AllocationCallToContextNodeMap)
@@ -2714,7 +2773,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   }
 
   // We should still have some context ids on the original Node.
-  assert(!Node->ContextIds.empty());
+  assert(!Node->emptyContextIds());
 
   // Sanity check that no alloc types on node or edges are None.
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
@@ -2918,7 +2977,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
       // find additional cloning is required.
       std::deque<ContextNode *> ClonesWorklist;
       // Ignore original Node if we moved all of its contexts to clones.
-      if (!Node->ContextIds.empty())
+      if (!Node->emptyContextIds())
         ClonesWorklist.push_back(Node);
       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
                             Node->Clones.end());
@@ -3258,7 +3317,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
 
     // Skip if either no call to update, or if we ended up with no context ids
     // (we moved all edges onto other clones).
-    if (!Node->hasCall() || Node->ContextIds.empty())
+    if (!Node->hasCall() || Node->emptyContextIds())
       return;
 
     if (Node->IsAllocation) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 89193f8ff94b..38c1c2644554 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4745,6 +4745,29 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q,
                           Constant::getNullValue(Op1->getType()));
   }
 
+  if (!ICmpInst::isSigned(Pred))
+    return nullptr;
+
+  KnownBits KnownY = IC.computeKnownBits(A, /*Depth=*/0, &I);
+  // (X & NegY) spred X --> (X & NegY) upred X
+  if (KnownY.isNegative())
+    return new ICmpInst(ICmpInst::getUnsignedPredicate(Pred), Op0, Op1);
+
+  if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGT)
+    return nullptr;
+
+  if (KnownY.isNonNegative())
+    // (X & PosY) s<= X --> X s>= 0
+    // (X & PosY) s> X --> X s< 0
+    return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
+                        Constant::getNullValue(Op1->getType()));
+
+  if (isKnownNegative(Op1, IC.getSimplifyQuery().getWithInstruction(&I)))
+    // (NegX & Y) s<= NegX --> Y s< 0
+    // (NegX & Y) s> NegX --> Y s>= 0
+    return new ICmpInst(ICmpInst::getFlippedStrictnessPredicate(Pred), A,
+                        Constant::getNullValue(A->getType()));
+
   return nullptr;
 }
 
@@ -4772,7 +4795,7 @@ static Instruction *foldICmpOrXX(ICmpInst &I, const SimplifyQuery &Q,
   if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) {
     // icmp (X | Y) eq/ne Y --> (X & ~Y) eq/ne 0 if Y is freely invertible
     if (Value *NotOp1 =
-            IC.getFreelyInverted(Op1, Op1->hasOneUse(), &IC.Builder))
+            IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder))
       return new ICmpInst(Pred, IC.Builder.CreateAnd(A, NotOp1),
                           Constant::getNullValue(Op1->getType()));
     // icmp (X | Y) eq/ne Y --> (~X | Y) eq/ne -1 if X  is freely invertible.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 313beb7b6407..d2aaa5e23054 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1294,8 +1294,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
   // X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite
   // replacement cycle.
   Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1);
-  if (TrueVal != CmpLHS &&
-      isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) {
+  if (TrueVal != CmpLHS && isGuaranteedNotToBeUndef(CmpRHS, SQ.AC, &Sel, &DT)) {
     if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ,
                                           /* AllowRefinement */ true))
       // Require either the replacement or the simplification result to be a
@@ -1316,8 +1315,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
       if (replaceInInstruction(TrueVal, CmpLHS, CmpRHS))
         return &Sel;
   }
-  if (TrueVal != CmpRHS &&
-      isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT))
+  if (TrueVal != CmpRHS && isGuaranteedNotToBeUndef(CmpLHS, SQ.AC, &Sel, &DT))
     if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ,
                                           /* AllowRefinement */ true))
       if (isa<Constant>(CmpLHS) || isa<Constant>(V))
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 2aa21759d56e..a0e63bf12400 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -337,13 +337,17 @@ private:
                                  unsigned AccessSizeIndex,
                                  Instruction *InsertBefore, DomTreeUpdater &DTU,
                                  LoopInfo *LI);
-  bool ignoreMemIntrinsic(MemIntrinsic *MI);
+  bool ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, MemIntrinsic *MI);
   void instrumentMemIntrinsic(MemIntrinsic *MI);
   bool instrumentMemAccess(InterestingMemoryOperand &O, DomTreeUpdater &DTU,
                            LoopInfo *LI);
-  bool ignoreAccess(Instruction *Inst, Value *Ptr);
+  bool ignoreAccessWithoutRemark(Instruction *Inst, Value *Ptr);
+  bool ignoreAccess(OptimizationRemarkEmitter &ORE, Instruction *Inst,
+                    Value *Ptr);
+
   void getInterestingMemoryOperands(
-      Instruction *I, const TargetLibraryInfo &TLI,
+      OptimizationRemarkEmitter &ORE, Instruction *I,
+      const TargetLibraryInfo &TLI,
       SmallVectorImpl<InterestingMemoryOperand> &Interesting);
 
   void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
@@ -765,7 +769,8 @@ Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) {
   return IRB.CreateLoad(PtrTy, GlobalDynamicAddress);
 }
 
-bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
+bool HWAddressSanitizer::ignoreAccessWithoutRemark(Instruction *Inst,
+                                                   Value *Ptr) {
   // Do not instrument accesses from different address spaces; we cannot deal
   // with them.
   Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
@@ -795,8 +800,23 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
   return false;
 }
 
+bool HWAddressSanitizer::ignoreAccess(OptimizationRemarkEmitter &ORE,
+                                      Instruction *Inst, Value *Ptr) {
+  bool Ignored = ignoreAccessWithoutRemark(Inst, Ptr);
+  if (Ignored) {
+    ORE.emit(
+        [&]() { return OptimizationRemark(DEBUG_TYPE, "ignoreAccess", Inst); });
+  } else {
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "ignoreAccess", Inst);
+    });
+  }
+  return Ignored;
+}
+
 void HWAddressSanitizer::getInterestingMemoryOperands(
-    Instruction *I, const TargetLibraryInfo &TLI,
+    OptimizationRemarkEmitter &ORE, Instruction *I,
+    const TargetLibraryInfo &TLI,
     SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
   // Skip memory accesses inserted by another instrumentation.
   if (I->hasMetadata(LLVMContext::MD_nosanitize))
@@ -807,22 +827,22 @@ void HWAddressSanitizer::getInterestingMemoryOperands(
     return;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (!ClInstrumentReads || ignoreAccess(I, LI->getPointerOperand()))
+    if (!ClInstrumentReads || ignoreAccess(ORE, I, LI->getPointerOperand()))
       return;
     Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
                              LI->getType(), LI->getAlign());
   } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (!ClInstrumentWrites || ignoreAccess(I, SI->getPointerOperand()))
+    if (!ClInstrumentWrites || ignoreAccess(ORE, I, SI->getPointerOperand()))
       return;
     Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
                              SI->getValueOperand()->getType(), SI->getAlign());
   } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
-    if (!ClInstrumentAtomics || ignoreAccess(I, RMW->getPointerOperand()))
+    if (!ClInstrumentAtomics || ignoreAccess(ORE, I, RMW->getPointerOperand()))
       return;
     Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
                              RMW->getValOperand()->getType(), std::nullopt);
   } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
-    if (!ClInstrumentAtomics || ignoreAccess(I, XCHG->getPointerOperand()))
+    if (!ClInstrumentAtomics || ignoreAccess(ORE, I, XCHG->getPointerOperand()))
       return;
     Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
                              XCHG->getCompareOperand()->getType(),
@@ -830,7 +850,7 @@ void HWAddressSanitizer::getInterestingMemoryOperands(
   } else if (auto *CI = dyn_cast<CallInst>(I)) {
     for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
       if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
-          ignoreAccess(I, CI->getArgOperand(ArgNo)))
+          ignoreAccess(ORE, I, CI->getArgOperand(ArgNo)))
         continue;
       Type *Ty = CI->getParamByValType(ArgNo);
       Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
@@ -1035,13 +1055,14 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
         ->setSuccessor(0, TCI.TagMismatchTerm->getParent());
 }
 
-bool HWAddressSanitizer::ignoreMemIntrinsic(MemIntrinsic *MI) {
+bool HWAddressSanitizer::ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE,
+                                            MemIntrinsic *MI) {
   if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
-    return (!ClInstrumentWrites || ignoreAccess(MTI, MTI->getDest())) &&
-           (!ClInstrumentReads || ignoreAccess(MTI, MTI->getSource()));
+    return (!ClInstrumentWrites || ignoreAccess(ORE, MTI, MTI->getDest())) &&
+           (!ClInstrumentReads || ignoreAccess(ORE, MTI, MTI->getSource()));
   }
   if (isa<MemSetInst>(MI))
-    return !ClInstrumentWrites || ignoreAccess(MI, MI->getDest());
+    return !ClInstrumentWrites || ignoreAccess(ORE, MI, MI->getDest());
   return false;
 }
 
@@ -1541,6 +1562,9 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
 
   NumTotalFuncs++;
 
+  OptimizationRemarkEmitter &ORE =
+      FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
   if (selectiveInstrumentationShouldSkip(F, FAM))
     return;
 
@@ -1562,10 +1586,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
     if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
       LandingPadVec.push_back(&Inst);
 
-    getInterestingMemoryOperands(&Inst, TLI, OperandsToInstrument);
+    getInterestingMemoryOperands(ORE, &Inst, TLI, OperandsToInstrument);
 
     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
-      if (!ignoreMemIntrinsic(MI))
+      if (!ignoreMemIntrinsic(ORE, MI))
         IntrinToInstrument.push_back(MI);
   }
 
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index ba2546b8db0e..4371b821eae6 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -827,7 +827,8 @@ private:
         return false;
       }
 
-      if (Metrics.convergent) {
+      // FIXME: Allow jump threading with controlled convergence.
+      if (Metrics.Convergence != ConvergenceKind::None) {
         LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
                           << "convergent instructions.\n");
         ORE->emit([&]() {
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 7b4c54370e48..f8e2f1f28088 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns);
 
   if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) {
-    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains instructions"
-                      << " which cannot be duplicated or have invalid cost.\n");
+    LLVM_DEBUG(dbgs() << "  Loop not considered unrollable\n");
     return LoopUnrollResult::Unmodified;
   }
 
@@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
     return LoopUnrollResult::Unmodified;
   }
-  if (InnerUCE.Convergent || OuterUCE.Convergent) {
+  // FIXME: The call to canUnroll() allows some controlled convergent
+  // operations, but we block them here for future changes.
+  if (InnerUCE.Convergence != ConvergenceKind::None ||
+      OuterUCE.Convergence != ConvergenceKind::None) {
     LLVM_DEBUG(
         dbgs() << "  Not unrolling loop with convergent instructions.\n");
     return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 10fc9e9303e8..cbc35b6dd429 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator(
     const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
   CodeMetrics Metrics;
   for (BasicBlock *BB : L->blocks())
-    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+    Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
+                              L);
   NumInlineCandidates = Metrics.NumInlineCandidates;
   NotDuplicatable = Metrics.notDuplicatable;
-  Convergent = Metrics.convergent;
+  Convergence = Metrics.Convergence;
   LoopSize = Metrics.NumInsts;
+  ConvergenceAllowsRuntime =
+      Metrics.Convergence != ConvergenceKind::Uncontrolled &&
+      !getLoopConvergenceHeart(L);
 
   // Don't allow an estimate of size zero.  This would allows unrolling of loops
   // with huge iteration counts, which is a compile time problem even if it's
@@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator(
     LoopSize = BEInsns + 1;
 }
 
+bool UnrollCostEstimator::canUnroll() const {
+  switch (Convergence) {
+  case ConvergenceKind::ExtendedLoop:
+    LLVM_DEBUG(dbgs() << "  Convergence prevents unrolling.\n");
+    return false;
+  default:
+    break;
+  }
+  if (!LoopSize.isValid()) {
+    LLVM_DEBUG(dbgs() << "  Invalid loop size prevents unrolling.\n");
+    return false;
+  }
+  if (NotDuplicatable) {
+    LLVM_DEBUG(dbgs() << "  Non-duplicatable blocks prevent unrolling.\n");
+    return false;
+  }
+  return true;
+}
+
 uint64_t UnrollCostEstimator::getUnrolledLoopSize(
     const TargetTransformInfo::UnrollingPreferences &UP,
     unsigned CountOverwrite) const {
@@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
   if (!UCE.canUnroll()) {
-    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains instructions"
-                      << " which cannot be duplicated or have invalid cost.\n");
+    LLVM_DEBUG(dbgs() << "  Loop not considered unrollable.\n");
     return LoopUnrollResult::Unmodified;
   }
 
@@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // is unsafe -- it adds a control-flow dependency to the convergent
   // operation.  Therefore restrict remainder loop (try unrolling without).
   //
-  // TODO: This is quite conservative.  In practice, convergent_op()
-  // is likely to be called unconditionally in the loop.  In this
-  // case, the program would be ill-formed (on most architectures)
-  // unless n were the same on all threads in a thread group.
-  // Assuming n is the same on all threads, any kind of unrolling is
-  // safe.  But currently llvm's notion of convergence isn't powerful
-  // enough to express this.
-  if (UCE.Convergent)
-    UP.AllowRemainder = false;
+  // TODO: This is somewhat conservative; we could allow the remainder if the
+  // trip count is uniform.
+  UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
 
   // Try to find the trip count upper bound if we cannot find the exact trip
   // count.
@@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   if (!UP.Count)
     return LoopUnrollResult::Unmodified;
 
+  UP.Runtime &= UCE.ConvergenceAllowsRuntime;
+
   if (PP.PeelCount) {
     assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
     LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
@@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   // Unroll the loop.
   Loop *RemainderLoop = nullptr;
+  UnrollLoopOptions ULO;
+  ULO.Count = UP.Count;
+  ULO.Force = UP.Force;
+  ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
+  ULO.UnrollRemainder = UP.UnrollRemainder;
+  ULO.Runtime = UP.Runtime;
+  ULO.ForgetAllSCEV = ForgetAllSCEV;
+  ULO.Heart = getLoopConvergenceHeart(L);
   LoopUnrollResult UnrollResult = UnrollLoop(
-      L,
-      {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
-       UP.UnrollRemainder, ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
+      L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index eb471b259c7d..cfe63496a100 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1221,7 +1221,6 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
       SmallPtrSet<const Value *, 4> ObjSet;
       SmallVector<Metadata *, 4> Scopes, NoAliases;
 
-      SmallSetVector<const Argument *, 4> NAPtrArgs;
       for (const Value *V : PtrArgs) {
         SmallVector<const Value *, 4> Objects;
         getUnderlyingObjects(V, Objects, /* LI = */ nullptr);
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 08ba65d9483e..3d950b151cd3 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
                    L->dump());
         return Rotated;
       }
-      if (Metrics.convergent) {
+      if (Metrics.Convergence != ConvergenceKind::None) {
         LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
                    "instructions: ";
                    L->dump());
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 1216538195fb..90d7b99e9d81 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
   }
 }
 
+// Loops containing convergent instructions that are uncontrolled or controlled
+// from outside the loop must have a count that divides their TripMultiple.
+LLVM_ATTRIBUTE_USED
+static bool canHaveUnrollRemainder(const Loop *L) {
+  if (getLoopConvergenceHeart(L))
+    return false;
+
+  // Check for uncontrolled convergent operations.
+  for (auto &BB : L->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<ConvergenceControlInst>(I))
+        return true;
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (CB->isConvergent())
+          return CB->getConvergenceControlToken();
+    }
+  }
+  return true;
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form.  Unrolling
 /// can only fail when the loop's latch block is not terminated by a conditional
 /// branch instruction. However, if the trip count (and multiple) are not known,
@@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  // Loops containing convergent instructions cannot use runtime unrolling,
-  // as the prologue/epilogue may add additional control-dependencies to
-  // convergent operations.
-  LLVM_DEBUG(
-      {
-        bool HasConvergent = false;
-        for (auto &BB : L->blocks())
-          for (auto &I : *BB)
-            if (auto *CB = dyn_cast<CallBase>(&I))
-              HasConvergent |= CB->isConvergent();
-        assert((!HasConvergent || !ULO.Runtime) &&
-               "Can't runtime unroll if loop contains a convergent operation.");
-      });
+  assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&
+         "Can't runtime unroll if loop contains a convergent operation.");
 
   bool EpilogProfitability =
       UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
@@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       if (OldLoop)
         LoopsToSimplify.insert(NewLoops[OldLoop]);
 
-      if (*BB == Header)
+      if (*BB == Header) {
         // Loop over all of the PHI nodes in the block, changing them to use
         // the incoming values from the previous block.
         for (PHINode *OrigPHI : OrigPHINode) {
@@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
           NewPHI->eraseFromParent();
         }
 
+        // Eliminate copies of the loop heart intrinsic, if any.
+        if (ULO.Heart) {
+          auto it = VMap.find(ULO.Heart);
+          assert(it != VMap.end());
+          Instruction *heartCopy = cast<Instruction>(it->second);
+          heartCopy->eraseFromParent();
+          VMap.erase(it);
+        }
+      }
+
       // Update our running map of newest clones
       LastValueMap[*BB] = New;
       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index e1af02829c1d..dd7150bc63ec 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder(
   auto UnrollResult = LoopUnrollResult::Unmodified;
   if (remainderLoop && UnrollRemainder) {
     LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
-    UnrollResult =
-        UnrollLoop(remainderLoop,
-                   {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false,
-                    /*AllowExpensiveTripCount*/ false,
-                    /*UnrollRemainder*/ false, ForgetAllSCEV},
-                   LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+    UnrollLoopOptions ULO;
+    ULO.Count = Count - 1;
+    ULO.Force = false;
+    ULO.Runtime = false;
+    ULO.AllowExpensiveTripCount = false;
+    ULO.UnrollRemainder = false;
+    ULO.ForgetAllSCEV = ForgetAllSCEV;
+    assert(!getLoopConvergenceHeart(L) &&
+           "A loop with a convergence heart does not allow runtime unrolling.");
+    UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,
+                              /*ORE*/ nullptr, PreserveLCSSA);
   }
 
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 058746880743..d6b4acb2bdba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -274,6 +274,13 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
 }
 
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul,
+                             /* Commutative =*/true>
+m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1);
+}
+
 /// Match a binary OR operation. Note that while conceptually the operands can
 /// be matched commutatively, \p Commutative defaults to false in line with the
 /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ab3b5cf2b9da..8ec67eb2f54b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1037,8 +1037,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
-                            m_Mul(m_SpecificInt(1), m_VPValue(A)))))
+  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
     return R.getVPSingleValue()->replaceAllUsesWith(A);
 }